From 4ba9d711ba0a5bf0be00936d3258d4a5e4164d4f Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 19 Apr 2025 22:28:40 -0700 Subject: [PATCH 001/200] metal: add neg operator (#13029) --- ggml/src/ggml-metal/ggml-metal.m | 15 +++++++++++++++ ggml/src/ggml-metal/ggml-metal.metal | 7 +++++++ 2 files changed, 22 insertions(+) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 85f3ae7bfdc31..266d8af4693c2 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -481,6 +481,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_SQRT, GGML_METAL_KERNEL_TYPE_SIN, GGML_METAL_KERNEL_TYPE_COS, + GGML_METAL_KERNEL_TYPE_NEG, GGML_METAL_KERNEL_TYPE_SUM_ROWS, GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, @@ -1159,6 +1160,7 @@ @implementation GGMLMetalClass GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN, sin, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS, cos, true); + GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_NEG, neg, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true); @@ -1320,6 +1322,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_NEG: return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32; default: return false; @@ -2010,6 +2013,18 @@ static void ggml_metal_encode_node( [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_UNARY_OP_NEG: + { + id pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_NEG].pipeline; + + [encoder setComputePipelineState:pipeline]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(dst); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; default: { GGML_LOG_WARN("%s: node %3d, op = %8s not implemented\n", __func__, idx, ggml_op_name(dst->op)); diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index dc7eab03ee8a2..8d6e99e621e9e 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -949,6 +949,13 @@ kernel void kernel_cos( dst[tpig] = cos(src0[tpig]); } +kernel void kernel_neg( + device const float * src0, + device float * dst, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = -src0[tpig]; +} + kernel void kernel_sum_rows( device const float * src0, device float * dst, From 66168204be9559dc841f06f0025f3da01d9a8546 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Sun, 20 Apr 2025 03:50:02 -0500 Subject: [PATCH 002/200] vulkan: support noncontiguous rms_norm (#13031) --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 22 ++++++++++--- .../ggml-vulkan/vulkan-shaders/rms_norm.comp | 32 ++++++++++++------- 2 files changed, 39 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 2f6d03c939eb5..39f3cd343ac45 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -2397,7 +2397,7 @@ static void 
ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_group_norm_f32, "group_norm_f32", group_norm_f32_len, group_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_rms_norm_back_f32, "rms_norm_back_f32", rms_norm_back_f32_len, rms_norm_back_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_l2_norm_f32, "l2_norm_f32", l2_norm_f32_len, l2_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); @@ -6006,6 +6006,7 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_REPEAT: case GGML_OP_REPEAT_BACK: case GGML_OP_ROPE: + case GGML_OP_RMS_NORM: return true; default: return false; @@ -6216,7 +6217,6 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co switch (op) { case GGML_OP_NORM: - case GGML_OP_RMS_NORM: case GGML_OP_RMS_NORM_BACK: case GGML_OP_L2_NORM: case GGML_OP_SOFT_MAX: @@ -6233,6 +6233,10 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co elements = { nr, 1, 1 }; } } break; + case GGML_OP_RMS_NORM: + elements = { (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne03 }; + break; + case GGML_OP_SUM: // We use GGML_OP_SUM_ROWS with 1 row. 
elements = { 1, 1, 1 }; @@ -6883,7 +6887,17 @@ static void ggml_vk_group_norm(ggml_backend_vk_context * ctx, vk_context& subctx static void ggml_vk_rms_norm(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) { float * op_params = (float *)dst->op_params; - ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], op_params[0], 0.0f }, dryrun); + const uint32_t src0_type_size = ggml_type_size(src0->type); + const uint32_t dst_type_size = ggml_type_size(dst->type); + + ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_RMS_NORM, { + (uint32_t)ggml_nelements(src0), + (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size, + (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size, + 0, + op_params[0], 0.0f, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + }, dryrun); } static void ggml_vk_rms_norm_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -9388,10 +9402,10 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm case GGML_OP_VIEW: case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: + case GGML_OP_RMS_NORM: return true; case GGML_OP_NORM: case GGML_OP_GROUP_NORM: - case GGML_OP_RMS_NORM: case GGML_OP_L2_NORM: return ggml_is_contiguous(op->src[0]); case GGML_OP_ADD: diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp index b554400ba393f..deb8ee9960f58 100644 --- a/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp +++ b/ggml/src/ggml-vulkan/vulkan-shaders/rms_norm.comp @@ -1,6 +1,6 @@ #version 450 -#include "generic_head.comp" +#include "generic_unary_head.comp" #include "types.comp" #extension GL_EXT_control_flow_attributes : enable @@ -8,19 +8,29 @@ layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in; -layout (binding = 0) readonly buffer X {A_TYPE data_a[];}; -layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; - shared FLOAT_TYPE sum[BLOCK_SIZE]; void main() { - const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x; - const uint tid = gl_LocalInvocationID.x; + const uint ncols = p.ne00; + const uint nrows = gl_NumWorkGroups.x; + const uint nchannels = gl_NumWorkGroups.y; + + const uint row = gl_WorkGroupID.x; + const uint channel = gl_WorkGroupID.y; + const uint samp = gl_WorkGroupID.z; + const uint tid = gl_LocalInvocationID.x; + + const uint stride_row = p.nb01; + const uint stride_channel = p.nb02; + const uint stride_sample = p.nb03; + + uint32_t a_offset = samp*stride_sample + channel*stride_channel + row*stride_row + get_aoffset(); + uint32_t d_offset = ((samp*nchannels + channel)*nrows + row)*ncols + get_doffset(); sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp - [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { - const FLOAT_TYPE xi = FLOAT_TYPE(data_a[row*p.KX + col]); + [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + const FLOAT_TYPE xi = FLOAT_TYPE(data_a[a_offset + 
col]); sum[tid] += xi * xi; } @@ -33,10 +43,10 @@ void main() { barrier(); } - const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(p.KX); + const FLOAT_TYPE mean = sum[0] / FLOAT_TYPE(ncols); const FLOAT_TYPE scale = inversesqrt(mean + FLOAT_TYPE(p.param1)); - [[unroll]] for (uint col = tid; col < p.KX; col += BLOCK_SIZE) { - data_d[row*p.KX + col] = D_TYPE(scale * FLOAT_TYPE(data_a[row*p.KX + col])); + [[unroll]] for (uint col = tid; col < ncols; col += BLOCK_SIZE) { + data_d[d_offset + col] = D_TYPE(scale * FLOAT_TYPE(data_a[a_offset + col])); } } From 6602304814e679cc8c162bb760a034aceb4f8965 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 20 Apr 2025 03:15:41 -0700 Subject: [PATCH 003/200] llava: fix errors in clip.h on certain compilers (#13030) --- examples/llava/clip.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/llava/clip.h b/examples/llava/clip.h index cc133a58de3e8..5fc45d3e23904 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -30,12 +30,13 @@ struct clip_image_size { int height; }; +struct clip_image_f32; struct clip_image_u8_batch; struct clip_image_f32_batch; struct clip_context_params { bool use_gpu; - ggml_log_level verbosity; + enum ggml_log_level verbosity; }; // deprecated, use clip_init @@ -84,7 +85,7 @@ CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch); CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size() CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny -CLIP_API clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data +CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data /** * Build image from pixels decoded by other libraries instead of stb_image.h for better performance. 
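Aside on the clip.h change above: the errors it fixes appear to come from a C-vs-C++ declaration rule. In C, struct and enum tags are not type names by themselves, so a header meant to compile under both languages has to spell out the elaborated forms ("struct X", "enum Y") and forward-declare any struct that is only used through pointers. Below is a minimal sketch of that pattern, using hypothetical names (image_batch, log_level, context_params, get_batch) rather than the real clip API:

/* illustrative header only; all names here are made up, not from llama.cpp */
struct image_batch;                        /* forward declaration: pointers to an incomplete type are fine in C */
enum log_level { LOG_INFO, LOG_WARN };

struct context_params {
    int  use_gpu;
    enum log_level verbosity;              /* C requires the 'enum' keyword here; C++ accepts plain 'log_level' */
};

struct image_batch * get_batch(int idx);   /* likewise, 'struct' is required in the return type for C */

A C++ compiler accepts these declarations with or without the keywords, which is presumably why the omission only surfaced "on certain compilers", as the commit title puts it.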
From 2016f07bd106c73699ecbaace80f55db5ed95dac Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sun, 20 Apr 2025 23:29:36 +0200 Subject: [PATCH 004/200] convert : experimental support for `--mmproj` flag (#13023) * convert : experimental support for `--mmproj` flag * fix bad ctrl+f replace * fix style * split into subclasses TextModel and VisionModel * rename Mode --> ModelBase * small fix * correct CLIP_VISION arch name (because existing GGUF already use it) * Apply suggestions from code review Co-authored-by: compilade * fix Mistral3Model * fix typo Co-authored-by: compilade --------- Co-authored-by: compilade --- convert_hf_to_gguf.py | 634 +++++++++++++++++++-------------- convert_lora_to_gguf.py | 6 +- examples/llava/clip-impl.h | 3 - gguf-py/gguf/constants.py | 139 +++++++- gguf-py/gguf/tensor_mapping.py | 144 ++++++++ 5 files changed, 647 insertions(+), 279 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 89522dee8b8ad..6d34541a3cecc 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -42,11 +42,19 @@ class SentencePieceTokenTypes(IntEnum): BYTE = 6 -AnyModel = TypeVar("AnyModel", bound="type[Model]") +class ModelType(IntEnum): + TEXT = 1 + VISION = 2 -class Model: - _model_classes: dict[str, type[Model]] = {} +AnyModel = TypeVar("AnyModel", bound="type[ModelBase]") + + +class ModelBase: + _model_classes: dict[ModelType, dict[str, type[ModelBase]]] = { + ModelType.TEXT: {}, + ModelType.VISION: {}, + } dir_model: Path ftype: gguf.LlamaFileType @@ -75,7 +83,9 @@ def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, small_first_shard: bool = False, hparams: dict[str, Any] | None = None, remote_hf_model_id: str | None = None): - if type(self) is Model: + if type(self) is ModelBase or \ + type(self) is TextModel or \ + type(self) is VisionModel: raise TypeError(f"{type(self).__name__!r} should not be directly instantiated") self.dir_model = dir_model @@ -98,11 +108,11 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: self.get_tensors = get_remote_tensors else: - self.part_names = Model.get_model_part_names(self.dir_model, "model", ".safetensors") + self.part_names = ModelBase.get_model_part_names(self.dir_model, "model", ".safetensors") self.is_safetensors = len(self.part_names) > 0 if not self.is_safetensors: - self.part_names = Model.get_model_part_names(self.dir_model, "pytorch_model", ".bin") - self.hparams = Model.load_hparams(self.dir_model) if hparams is None else hparams + self.part_names = ModelBase.get_model_part_names(self.dir_model, "pytorch_model", ".bin") + self.hparams = ModelBase.load_hparams(self.dir_model) if hparams is None else hparams self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer", "num_layers"]) self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count) self.tensor_names = None @@ -126,11 +136,10 @@ def get_remote_tensors() -> Iterator[tuple[str, Tensor]]: split_max_tensors=split_max_tensors, split_max_size=split_max_size, dry_run=dry_run, small_first_shard=small_first_shard) @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + 
def add_prefix_to_filename(cls, path: Path, prefix: str) -> Path: + stem, suffix = path.stem, path.suffix + new_name = f"{prefix}{stem}{suffix}" + return path.with_name(new_name) def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: key = next((k for k in keys if k in self.hparams), None) @@ -140,9 +149,6 @@ def find_hparam(self, keys: Iterable[str], optional: bool = False) -> Any: return None raise KeyError(f"could not find any of: {keys}") - def set_vocab(self): - self._set_vocab_gpt2() - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: tensor_names_from_parts: set[str] = set() @@ -230,50 +236,7 @@ def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", " return new_name def set_gguf_parameters(self): - self.gguf_writer.add_block_count(self.block_count) - - if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: - self.gguf_writer.add_context_length(n_ctx) - logger.info(f"gguf: context length = {n_ctx}") - - if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: - self.gguf_writer.add_embedding_length(n_embd) - logger.info(f"gguf: embedding length = {n_embd}") - - if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: - self.gguf_writer.add_feed_forward_length(n_ff) - logger.info(f"gguf: feed forward length = {n_ff}") - - if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: - self.gguf_writer.add_head_count(n_head) - logger.info(f"gguf: head count = {n_head}") - - if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: - self.gguf_writer.add_head_count_kv(n_head_kv) - logger.info(f"gguf: key-value head count = {n_head_kv}") - - if (rope_theta := self.hparams.get("rope_theta")) is not None: - self.gguf_writer.add_rope_freq_base(rope_theta) - logger.info(f"gguf: rope theta = {rope_theta}") - if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: - self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: - self.gguf_writer.add_layer_norm_eps(f_norm_eps) - logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") - if (n_experts := self.hparams.get("num_local_experts")) is not None: - self.gguf_writer.add_expert_count(n_experts) - logger.info(f"gguf: expert count = {n_experts}") - if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: - self.gguf_writer.add_expert_used_count(n_experts_used) - logger.info(f"gguf: experts used count = {n_experts_used}") - - if (head_dim := self.hparams.get("head_dim")) is not None: - self.gguf_writer.add_key_length(head_dim) - self.gguf_writer.add_value_length(head_dim) - - self.gguf_writer.add_file_type(self.ftype) - logger.info(f"gguf: file type = {self.ftype}") + raise NotImplementedError("set_gguf_parameters() must be implemented in subclasses") def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused @@ -419,27 +382,6 @@ def prepare_metadata(self, vocab_only: bool): if self.metadata.size_label is None and total_params > 0: self.metadata.size_label = gguf.size_label(total_params, shared_params, expert_params, expert_count) - # Extract the encoding scheme from the file type name. e.g. 
'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' - output_type: str = self.ftype.name.partition("_")[2] - - # Filename Output - if self.fname_out.is_dir(): - # Generate default filename based on model specification and available metadata - if not vocab_only: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) - else: - fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") - - # Use the default filename - self.fname_out = self.fname_out / f"{fname_default}.gguf" - else: - # Output path is a custom defined templated filename - # Note: `not is_dir()` is used because `.is_file()` will not detect - # file template strings as it doesn't actually exist as a file - - # Process templated file name with the output ftype, useful with the "auto" ftype - self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) - self.set_type() logger.info("Set meta model") @@ -448,12 +390,12 @@ def prepare_metadata(self, vocab_only: bool): logger.info("Set model parameters") self.set_gguf_parameters() - logger.info("Set model tokenizer") - self.set_vocab() - logger.info("Set model quantization version") self.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION) + def write_vocab(self): + raise NotImplementedError("write_vocab() must be implemented in subclasses") + def write(self): self.prepare_tensors() self.prepare_metadata(vocab_only=False) @@ -462,15 +404,6 @@ def write(self): self.gguf_writer.write_tensors_to_file(progress=True) self.gguf_writer.close() - def write_vocab(self): - if len(self.gguf_writer.tensors) != 1: - raise ValueError('Splitting the vocabulary is not supported') - - self.prepare_metadata(vocab_only=True) - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.close() - @staticmethod def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: part_names: list[str] = [] @@ -485,30 +418,131 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] @staticmethod def load_hparams(dir_model: Path): with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) + hparams = json.load(f) + if "text_config" in hparams: + hparams = {**hparams, **hparams["text_config"]} + return hparams @classmethod def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: assert names def func(modelcls: AnyModel) -> AnyModel: + model_type = ModelType.VISION if modelcls.model_arch == gguf.MODEL_ARCH.CLIP_VISION else ModelType.TEXT for name in names: - cls._model_classes[name] = modelcls + cls._model_classes[model_type][name] = modelcls return modelcls return func @classmethod def print_registered_models(cls): - for name in sorted(cls._model_classes.keys()): - logger.error(f"- {name}") + for model_type, model_classes in cls._model_classes.items(): + logger.error(f"{model_type.name} models:") + for name in sorted(model_classes.keys()): + logger.error(f" - {name}") @classmethod - def from_model_architecture(cls, arch: str) -> type[Model]: + def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type[ModelBase]: try: - return cls._model_classes[arch] + return cls._model_classes[model_type][arch] except 
KeyError: raise NotImplementedError(f'Architecture {arch!r} not supported!') from None + +class TextModel(ModelBase): + @classmethod + def __init_subclass__(cls): + # can't use an abstract property, because overriding it without type errors + # would require using decorated functions instead of simply defining the property + if "model_arch" not in cls.__dict__: + raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") + + def set_vocab(self): + self._set_vocab_gpt2() + + def prepare_metadata(self, vocab_only: bool): + super().prepare_metadata(vocab_only=vocab_only) + + total_params = self.gguf_writer.get_total_parameter_count()[0] + # Extract the encoding scheme from the file type name. e.g. 'gguf.LlamaFileType.MOSTLY_Q8_0' --> 'Q8_0' + output_type: str = self.ftype.name.partition("_")[2] + + # Filename Output + if self.fname_out.is_dir(): + # Generate default filename based on model specification and available metadata + if not vocab_only: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, self.metadata.size_label, output_type, model_type="LoRA" if total_params < 0 else None) + else: + fname_default: str = gguf.naming_convention(self.metadata.name, self.metadata.basename, self.metadata.finetune, self.metadata.version, size_label=None, output_type=None, model_type="vocab") + + # Use the default filename + self.fname_out = self.fname_out / f"{fname_default}.gguf" + else: + # Output path is a custom defined templated filename + # Note: `not is_dir()` is used because `.is_file()` will not detect + # file template strings as it doesn't actually exist as a file + + # Process templated file name with the output ftype, useful with the "auto" ftype + self.fname_out = self.fname_out.parent / gguf.fill_templated_filename(self.fname_out.name, output_type) + + logger.info("Set model tokenizer") + self.set_vocab() + + def set_gguf_parameters(self): + self.gguf_writer.add_block_count(self.block_count) + + if (n_ctx := self.find_hparam(["max_position_embeddings", "n_ctx"], optional=True)) is not None: + self.gguf_writer.add_context_length(n_ctx) + logger.info(f"gguf: context length = {n_ctx}") + + if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None: + self.gguf_writer.add_embedding_length(n_embd) + logger.info(f"gguf: embedding length = {n_embd}") + + if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None: + self.gguf_writer.add_feed_forward_length(n_ff) + logger.info(f"gguf: feed forward length = {n_ff}") + + if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None: + self.gguf_writer.add_head_count(n_head) + logger.info(f"gguf: head count = {n_head}") + + if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: + self.gguf_writer.add_head_count_kv(n_head_kv) + logger.info(f"gguf: key-value head count = {n_head_kv}") + + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) + logger.info(f"gguf: rope theta = {rope_theta}") + if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: + self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) + logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: + self.gguf_writer.add_layer_norm_eps(f_norm_eps) + logger.info(f"gguf: layer norm epsilon = {f_norm_eps}") + if 
(n_experts := self.hparams.get("num_local_experts")) is not None: + self.gguf_writer.add_expert_count(n_experts) + logger.info(f"gguf: expert count = {n_experts}") + if (n_experts_used := self.hparams.get("num_experts_per_tok")) is not None: + self.gguf_writer.add_expert_used_count(n_experts_used) + logger.info(f"gguf: experts used count = {n_experts_used}") + + if (head_dim := self.hparams.get("head_dim")) is not None: + self.gguf_writer.add_key_length(head_dim) + self.gguf_writer.add_value_length(head_dim) + + self.gguf_writer.add_file_type(self.ftype) + logger.info(f"gguf: file type = {self.ftype}") + + def write_vocab(self): + if len(self.gguf_writer.tensors) != 1: + raise ValueError('Splitting the vocabulary is not supported') + + self.prepare_metadata(vocab_only=True) + self.gguf_writer.write_header_to_file(path=self.fname_out) + self.gguf_writer.write_kv_data_to_file() + self.gguf_writer.close() + def does_token_look_special(self, token: str | bytes) -> bool: if isinstance(token, (bytes, bytearray)): token_text = token.decode(encoding="utf-8") @@ -1024,8 +1058,48 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab self.gguf_writer.add_add_eos_token(field.parts[-1].tolist()[0]) -@Model.register("GPTNeoXForCausalLM") -class GPTNeoXModel(Model): +class VisionModel(ModelBase): + model_arch = gguf.MODEL_ARCH.CLIP_VISION + n_text_embd = 0 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.model_arch != gguf.MODEL_ARCH.CLIP_VISION: + raise TypeError("VisionModel must be subclassed with model_arch = gguf.MODEL_ARCH.CLIP_VISION") + + # small hack to correct the number of layers + self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, 128) + self.n_embd_text = self.find_hparam(["hidden_size", "n_embd"]) + assert self.n_embd_text > 0, "n_embd not found in hparams" + + if "vision_config" not in self.hparams: + raise ValueError("vision_config not found in hparams") + # move vision config to the top level + self.hparams = self.hparams["vision_config"] + + def set_type(self): + self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) + + def set_gguf_parameters(self): + self.gguf_writer.add_file_type(self.ftype) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text) + self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True) + + # vision config + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"])) + self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"])) + + def write_vocab(self): + raise ValueError("VisionModel does not support vocab writing") + + +@ModelBase.register("GPTNeoXForCausalLM") +class GPTNeoXModel(TextModel): model_arch = gguf.MODEL_ARCH.GPTNEOX def set_gguf_parameters(self): @@ -1081,8 +1155,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("BloomForCausalLM", "BloomModel") -class BloomModel(Model): +@ModelBase.register("BloomForCausalLM", "BloomModel") +class 
BloomModel(TextModel): model_arch = gguf.MODEL_ARCH.BLOOM def set_gguf_parameters(self): @@ -1138,8 +1212,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("MPTForCausalLM") -class MPTModel(Model): +@ModelBase.register("MPTForCausalLM") +class MPTModel(TextModel): model_arch = gguf.MODEL_ARCH.MPT def set_vocab(self): @@ -1182,8 +1256,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("OrionForCausalLM") -class OrionModel(Model): +@ModelBase.register("OrionForCausalLM") +class OrionModel(TextModel): model_arch = gguf.MODEL_ARCH.ORION def set_vocab(self): @@ -1217,8 +1291,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_layer_norm_eps(self.hparams["rms_norm_eps"]) -@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") -class BaichuanModel(Model): +@ModelBase.register("BaichuanForCausalLM", "BaiChuanForCausalLM") +class BaichuanModel(TextModel): model_arch = gguf.MODEL_ARCH.BAICHUAN def set_vocab(self): @@ -1297,8 +1371,8 @@ def _reverse_hf_part(self, weights: Tensor, n_part: int) -> Tensor: return weights[r * n_part:r * n_part + r, ...] -@Model.register("XverseForCausalLM") -class XverseModel(Model): +@ModelBase.register("XverseForCausalLM") +class XverseModel(TextModel): model_arch = gguf.MODEL_ARCH.XVERSE def set_vocab(self): @@ -1404,8 +1478,8 @@ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | Non ) -@Model.register("FalconForCausalLM", "RWForCausalLM") -class FalconModel(Model): +@ModelBase.register("FalconForCausalLM", "RWForCausalLM") +class FalconModel(TextModel): model_arch = gguf.MODEL_ARCH.FALCON def set_gguf_parameters(self): @@ -1458,8 +1532,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("GPTBigCodeForCausalLM") -class StarCoderModel(Model): +@ModelBase.register("GPTBigCodeForCausalLM") +class StarCoderModel(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER def set_gguf_parameters(self): @@ -1475,8 +1549,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@Model.register("GPTRefactForCausalLM") -class RefactModel(Model): +@ModelBase.register("GPTRefactForCausalLM") +class RefactModel(TextModel): model_arch = gguf.MODEL_ARCH.REFACT def set_vocab(self): @@ -1539,8 +1613,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") -class StableLMModel(Model): +@ModelBase.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") +class StableLMModel(TextModel): model_arch = gguf.MODEL_ARCH.STABLELM def set_vocab(self): @@ -1629,8 +1703,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@Model.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") -class LlamaModel(Model): +@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True @@ -1778,23 +1852,13 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("Llama4ForConditionalGeneration") +@ModelBase.register("Llama4ForConditionalGeneration") class Llama4Model(LlamaModel): model_arch = 
gguf.MODEL_ARCH.LLAMA4 - has_vision: bool = False undo_permute = False - # TODO @ngxson : avoid duplicate this code everywhere by at least support "text_config" - # same with llama, but we need to merge the text_config into the root level of hparams def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams super().__init__(*args, **kwargs) - if "vision_config" in hparams: - logger.info("Has vision encoder, but it will be ignored") - self.has_vision = True # IMPORTANT: the normal "intermediate_size" is renamed to "intermediate_size_mlp", we need to undo this self.hparams["intermediate_size_moe"] = self.hparams["intermediate_size"] self.hparams["intermediate_size"] = self.hparams["intermediate_size_mlp"] @@ -1829,18 +1893,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): return super().modify_tensors(data_torch, name, bid) -@Model.register("Mistral3ForConditionalGeneration") +@ModelBase.register("Mistral3ForConditionalGeneration") class Mistral3Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA - # we need to merge the text_config into the root level of hparams - def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams - super().__init__(*args, **kwargs) - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): name = name.replace("language_model.", "") if "multi_modal_projector" in name or "vision_tower" in name: @@ -1848,8 +1904,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None): return super().modify_tensors(data_torch, name, bid) -@Model.register("DeciLMForCausalLM") -class DeciModel(Model): +@ModelBase.register("DeciLMForCausalLM") +class DeciModel(TextModel): model_arch = gguf.MODEL_ARCH.DECI @staticmethod @@ -2020,8 +2076,8 @@ def prepare_tensors(self): super().prepare_tensors() -@Model.register("BitnetForCausalLM") -class BitnetModel(Model): +@ModelBase.register("BitnetForCausalLM") +class BitnetModel(TextModel): model_arch = gguf.MODEL_ARCH.BITNET def set_vocab(self): @@ -2061,8 +2117,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@Model.register("GrokForCausalLM") -class GrokModel(Model): +@ModelBase.register("GrokForCausalLM") +class GrokModel(TextModel): model_arch = gguf.MODEL_ARCH.GROK def set_vocab(self): @@ -2114,8 +2170,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("DbrxForCausalLM") -class DbrxModel(Model): +@ModelBase.register("DbrxForCausalLM") +class DbrxModel(TextModel): model_arch = gguf.MODEL_ARCH.DBRX def set_gguf_parameters(self): @@ -2183,8 +2239,8 @@ def tensor_force_quant(self, name: str, new_name: str, bid: int | None, n_dims: return n_dims > 1 -@Model.register("MiniCPMForCausalLM") -class MiniCPMModel(Model): +@ModelBase.register("MiniCPMForCausalLM") +class MiniCPMModel(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM def set_gguf_parameters(self): @@ -2238,8 +2294,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("MiniCPM3ForCausalLM") -class 
MiniCPM3Model(Model): +@ModelBase.register("MiniCPM3ForCausalLM") +class MiniCPM3Model(TextModel): model_arch = gguf.MODEL_ARCH.MINICPM3 def set_gguf_parameters(self): @@ -2291,8 +2347,8 @@ def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | Non ) -@Model.register("QWenLMHeadModel") -class QwenModel(Model): +@ModelBase.register("QWenLMHeadModel") +class QwenModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN @staticmethod @@ -2333,8 +2389,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) -@Model.register("Qwen2ForCausalLM") -class Qwen2Model(Model): +@ModelBase.register("Qwen2ForCausalLM") +class Qwen2Model(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2 def set_vocab(self): @@ -2352,8 +2408,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) -@Model.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") -class Qwen2VLModel(Model): +@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration") +class Qwen2VLModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2VL def set_gguf_parameters(self): @@ -2375,8 +2431,8 @@ def get_tensors(self) -> Iterator[tuple[str, Tensor]]: yield name, data -@Model.register("WavTokenizerDec") -class WavTokenizerDecModel(Model): +@ModelBase.register("WavTokenizerDec") +class WavTokenizerDecModel(TextModel): model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: @@ -2413,8 +2469,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_causal_attention(False) -@Model.register("Qwen2MoeForCausalLM") -class Qwen2MoeModel(Model): +@ModelBase.register("Qwen2MoeForCausalLM") +class Qwen2MoeModel(TextModel): model_arch = gguf.MODEL_ARCH.QWEN2MOE def set_gguf_parameters(self): @@ -2476,18 +2532,18 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("Qwen3ForCausalLM") +@ModelBase.register("Qwen3ForCausalLM") class Qwen3Model(Qwen2Model): model_arch = gguf.MODEL_ARCH.QWEN3 -@Model.register("Qwen3MoeForCausalLM") +@ModelBase.register("Qwen3MoeForCausalLM") class Qwen3MoeModel(Qwen2MoeModel): model_arch = gguf.MODEL_ARCH.QWEN3MOE -@Model.register("GPT2LMHeadModel") -class GPT2Model(Model): +@ModelBase.register("GPT2LMHeadModel") +class GPT2Model(TextModel): model_arch = gguf.MODEL_ARCH.GPT2 def set_gguf_parameters(self): @@ -2518,8 +2574,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return tensors -@Model.register("PhiForCausalLM") -class Phi2Model(Model): +@ModelBase.register("PhiForCausalLM") +class Phi2Model(TextModel): model_arch = gguf.MODEL_ARCH.PHI2 def set_gguf_parameters(self): @@ -2542,8 +2598,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_add_bos_token(False) -@Model.register("Phi3ForCausalLM") -class Phi3MiniModel(Model): +@ModelBase.register("Phi3ForCausalLM") +class Phi3MiniModel(TextModel): model_arch = gguf.MODEL_ARCH.PHI3 def set_vocab(self): @@ -2720,7 +2776,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FACTORS_SHORT), torch.tensor(short_factors, dtype=torch.float32)) -@Model.register("PhiMoEForCausalLM") +@ModelBase.register("PhiMoEForCausalLM") class PhiMoeModel(Phi3MiniModel): model_arch = gguf.MODEL_ARCH.PHIMOE @@ -2777,8 +2833,8 @@ def prepare_tensors(self): raise 
ValueError(f"Unprocessed experts: {experts}") -@Model.register("PlamoForCausalLM") -class PlamoModel(Model): +@ModelBase.register("PlamoForCausalLM") +class PlamoModel(TextModel): model_arch = gguf.MODEL_ARCH.PLAMO def set_vocab(self): @@ -2825,8 +2881,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("CodeShellForCausalLM") -class CodeShellModel(Model): +@ModelBase.register("CodeShellForCausalLM") +class CodeShellModel(TextModel): model_arch = gguf.MODEL_ARCH.CODESHELL def set_gguf_parameters(self): @@ -2866,8 +2922,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("InternLM2ForCausalLM") -class InternLM2Model(Model): +@ModelBase.register("InternLM2ForCausalLM") +class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 def set_vocab(self): @@ -3039,8 +3095,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("InternLM3ForCausalLM") -class InternLM3Model(Model): +@ModelBase.register("InternLM3ForCausalLM") +class InternLM3Model(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA def set_vocab(self): @@ -3099,8 +3155,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("BertModel", "BertForMaskedLM", "CamembertModel") -class BertModel(Model): +@ModelBase.register("BertModel", "BertForMaskedLM", "CamembertModel") +class BertModel(TextModel): model_arch = gguf.MODEL_ARCH.BERT def __init__(self, *args, **kwargs): @@ -3187,7 +3243,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("RobertaModel") +@ModelBase.register("RobertaModel") class RobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3232,7 +3288,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("NomicBertModel") +@ModelBase.register("NomicBertModel") class NomicBertModel(BertModel): model_arch = gguf.MODEL_ARCH.NOMIC_BERT @@ -3262,7 +3318,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) -@Model.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") class XLMRobertaModel(BertModel): model_arch = gguf.MODEL_ARCH.BERT @@ -3373,8 +3429,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("GemmaForCausalLM") -class GemmaModel(Model): +@ModelBase.register("GemmaForCausalLM") +class GemmaModel(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA def set_vocab(self): @@ -3424,8 +3480,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma2ForCausalLM") -class Gemma2Model(Model): +@ModelBase.register("Gemma2ForCausalLM") +class Gemma2Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA2 def set_vocab(self): @@ -3471,27 +3527,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Gemma3ForCausalLM", 
"Gemma3ForConditionalGeneration") -class Gemma3Model(Model): +@ModelBase.register("Gemma3ForCausalLM", "Gemma3ForConditionalGeneration") +class Gemma3Model(TextModel): model_arch = gguf.MODEL_ARCH.GEMMA3 - has_vision: bool = False - - # we need to merge the text_config into the root level of hparams - def __init__(self, *args, **kwargs): - hparams = kwargs["hparams"] if "hparams" in kwargs else Model.load_hparams(args[0]) - if "text_config" in hparams: - hparams = {**hparams, **hparams["text_config"]} - kwargs["hparams"] = hparams - super().__init__(*args, **kwargs) - if "vision_config" in hparams: - logger.info("Has vision encoder, but it will be ignored") - self.has_vision = True - - def write(self): - super().write() - if self.has_vision: - logger.info("NOTE: this script only convert the language model to GGUF") - logger.info(" for the vision model, please use gemma3_convert_encoder_to_gguf.py") def set_vocab(self): self._set_vocab_sentencepiece() @@ -3529,10 +3567,10 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name.startswith("language_model."): name = name.replace("language_model.", "") + elif name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ - or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # this is for old HF model, should be removed later - # ignore vision tensors - return [] + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): + return [] # skip vision tensors # remove OOV (out-of-vocabulary) rows in token_embd if "embed_tokens.weight" in name: @@ -3548,13 +3586,58 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Starcoder2ForCausalLM") -class StarCoder2Model(Model): +@ModelBase.register("Gemma3ForConditionalGeneration") +class Gemma3VisionModel(VisionModel): + def set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3") + # default values below are taken from HF tranformers code + self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5]) + self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5]) + self.gguf_writer.add_bool (gguf.Keys.ClipVision.USE_GELU, True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + # related to https://github.com/ggml-org/llama.cpp/issues/13025 + if "input_projection" in name: + return gguf.GGMLQuantizationType.F16 + if ".embeddings." 
in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + + if name.startswith("multi_modal_projector.") or name.startswith("vision_tower.") \ + or name.startswith("multimodal_projector.") or name.startswith("vision_model."): + # process vision tensors + name = name.replace("_weight", ".weight") + if "fc1" in name: + name = name.replace("fc1", "fc2") + else: + name = name.replace("fc2", "fc1") + + # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector + # the other norm values are part of SigLIP model, and they are already correct + # ref code: Gemma3RMSNorm + if "soft_emb_norm.weight" in name: + logger.info(f"Correcting norm value for '{name}'") + data_torch = data_torch + 1 + + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + +@ModelBase.register("Starcoder2ForCausalLM") +class StarCoder2Model(TextModel): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("Rwkv6ForCausalLM") -class Rwkv6Model(Model): +@ModelBase.register("Rwkv6ForCausalLM") +class Rwkv6Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV6 def set_vocab(self): @@ -3626,7 +3709,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@Model.register("RWKV6Qwen2ForCausalLM") +@ModelBase.register("RWKV6Qwen2ForCausalLM") class RWKV6Qwen2Model(Rwkv6Model): model_arch = gguf.MODEL_ARCH.RWKV6QWEN2 @@ -3680,8 +3763,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data) -@Model.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") -class Rwkv7Model(Model): +@ModelBase.register("Rwkv7ForCausalLM", "RWKV7ForCausalLM") +class Rwkv7Model(TextModel): model_arch = gguf.MODEL_ARCH.RWKV7 def set_vocab(self): @@ -3799,7 +3882,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (new_name, data_torch) -@Model.register("RwkvHybridForCausalLM") +@ModelBase.register("RwkvHybridForCausalLM") class ARwkv7Model(Rwkv7Model): model_arch = gguf.MODEL_ARCH.ARWKV7 @@ -3842,8 +3925,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_head_count(0) -@Model.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") -class MambaModel(Model): +@ModelBase.register("MambaForCausalLM", "MambaLMHeadModel", "FalconMambaForCausalLM") +class MambaModel(TextModel): model_arch = gguf.MODEL_ARCH.MAMBA def set_vocab(self): @@ -3920,8 +4003,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(new_name, data_torch)] -@Model.register("CohereForCausalLM") -class CommandR2Model(Model): +@ModelBase.register("CohereForCausalLM") +class CommandR2Model(TextModel): model_arch = gguf.MODEL_ARCH.COMMAND_R def __init__(self, *args, **kwargs): @@ -3938,8 +4021,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("Cohere2ForCausalLM") -class Cohere2Model(Model): +@ModelBase.register("Cohere2ForCausalLM") +class Cohere2Model(TextModel): model_arch = gguf.MODEL_ARCH.COHERE2 def set_gguf_parameters(self): @@ -3956,9 +4039,9 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE) -@Model.register("OlmoForCausalLM") -@Model.register("OLMoForCausalLM") -class OlmoModel(Model): +@ModelBase.register("OlmoForCausalLM") 
+@ModelBase.register("OLMoForCausalLM") +class OlmoModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMO def set_gguf_parameters(self): @@ -3984,13 +4067,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("Olmo2ForCausalLM") -class Olmo2Model(Model): +@ModelBase.register("Olmo2ForCausalLM") +class Olmo2Model(TextModel): model_arch = gguf.MODEL_ARCH.OLMO2 -@Model.register("OlmoeForCausalLM") -class OlmoeModel(Model): +@ModelBase.register("OlmoeForCausalLM") +class OlmoeModel(TextModel): model_arch = gguf.MODEL_ARCH.OLMOE def set_gguf_parameters(self): @@ -4049,7 +4132,7 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("JinaBertModel", "JinaBertForMaskedLM") +@ModelBase.register("JinaBertModel", "JinaBertForMaskedLM") class JinaBertV2Model(BertModel): model_arch = gguf.MODEL_ARCH.JINA_BERT_V2 @@ -4096,8 +4179,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("OpenELMForCausalLM") -class OpenELMModel(Model): +@ModelBase.register("OpenELMForCausalLM") +class OpenELMModel(TextModel): model_arch = gguf.MODEL_ARCH.OPENELM @staticmethod @@ -4171,8 +4254,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield (self.map_tensor_name(name), data_torch) -@Model.register("ArcticForCausalLM") -class ArcticModel(Model): +@ModelBase.register("ArcticForCausalLM") +class ArcticModel(TextModel): model_arch = gguf.MODEL_ARCH.ARCTIC def set_vocab(self): @@ -4322,8 +4405,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekForCausalLM") -class DeepseekModel(Model): +@ModelBase.register("DeepseekForCausalLM") +class DeepseekModel(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK def set_vocab(self): @@ -4413,9 +4496,9 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("DeepseekV2ForCausalLM") -@Model.register("DeepseekV3ForCausalLM") -class DeepseekV2Model(Model): +@ModelBase.register("DeepseekV2ForCausalLM") +@ModelBase.register("DeepseekV3ForCausalLM") +class DeepseekV2Model(TextModel): model_arch = gguf.MODEL_ARCH.DEEPSEEK2 def set_vocab(self): @@ -4541,8 +4624,8 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("PLMForCausalLM") -class PLMModel(Model): +@ModelBase.register("PLMForCausalLM") +class PLMModel(TextModel): model_arch = gguf.MODEL_ARCH.PLM def set_vocab(self): @@ -4564,11 +4647,11 @@ def prepare_tensors(self): super().prepare_tensors() -@Model.register("T5WithLMHeadModel") -@Model.register("T5ForConditionalGeneration") -@Model.register("MT5ForConditionalGeneration") -@Model.register("UMT5ForConditionalGeneration") -class T5Model(Model): +@ModelBase.register("T5WithLMHeadModel") +@ModelBase.register("T5ForConditionalGeneration") +@ModelBase.register("MT5ForConditionalGeneration") +@ModelBase.register("UMT5ForConditionalGeneration") +class T5Model(TextModel): model_arch = gguf.MODEL_ARCH.T5 def __init__(self, *args, **kwargs): @@ -4707,8 +4790,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("T5EncoderModel") -class T5EncoderModel(Model): +@ModelBase.register("T5EncoderModel") +class T5EncoderModel(TextModel): model_arch = gguf.MODEL_ARCH.T5ENCODER def 
__init__(self, *args, **kwargs): @@ -4846,8 +4929,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("JAISLMHeadModel") -class JaisModel(Model): +@ModelBase.register("JAISLMHeadModel") +class JaisModel(TextModel): model_arch = gguf.MODEL_ARCH.JAIS def __init__(self, *args, **kwargs): @@ -4929,8 +5012,8 @@ def prepare_tensors(self): self.gguf_writer.add_max_alibi_bias(self.max_alibi_bias) -@Model.register("Glm4ForCausalLM") -class Glm4Model(Model): +@ModelBase.register("Glm4ForCausalLM") +class Glm4Model(TextModel): model_arch = gguf.MODEL_ARCH.GLM4 def set_vocab(self): @@ -4945,8 +5028,8 @@ def set_gguf_parameters(self): self.gguf_writer.add_rope_scaling_orig_ctx_len(self.hparams["rope_scaling"]["original_max_position_embeddings"]) -@Model.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") -class ChatGLMModel(Model): +@ModelBase.register("GlmForCausalLM", "ChatGLMModel", "ChatGLMForConditionalGeneration") +class ChatGLMModel(TextModel): model_arch = gguf.MODEL_ARCH.CHATGLM def set_vocab_chatglm3(self): @@ -5100,8 +5183,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("NemotronForCausalLM") -class NemotronModel(Model): +@ModelBase.register("NemotronForCausalLM") +class NemotronModel(TextModel): model_arch = gguf.MODEL_ARCH.NEMOTRON def set_vocab(self): @@ -5141,8 +5224,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] -@Model.register("ExaoneForCausalLM") -class ExaoneModel(Model): +@ModelBase.register("ExaoneForCausalLM") +class ExaoneModel(TextModel): model_arch = gguf.MODEL_ARCH.EXAONE def set_gguf_parameters(self): @@ -5210,7 +5293,7 @@ def generate_extra_tensors(self) -> Iterable[tuple[str, Tensor]]: yield (self.format_tensor_name(gguf.MODEL_TENSOR.ROPE_FREQS), torch.tensor(rope_factors, dtype=torch.float32)) -@Model.register("GraniteForCausalLM") +@ModelBase.register("GraniteForCausalLM") class GraniteModel(LlamaModel): """Conversion for IBM's GraniteForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE @@ -5244,7 +5327,7 @@ def set_gguf_parameters(self): logger.info("gguf: (granite) logits_scale = %s", logits_scale) -@Model.register("GraniteMoeForCausalLM") +@ModelBase.register("GraniteMoeForCausalLM") class GraniteMoeModel(GraniteModel): """Conversion for IBM's GraniteMoeForCausalLM""" model_arch = gguf.MODEL_ARCH.GRANITE_MOE @@ -5268,8 +5351,8 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return super().modify_tensors(data_torch, name, bid) -@Model.register("BailingMoeForCausalLM") -class BailingMoeModel(Model): +@ModelBase.register("BailingMoeForCausalLM") +class BailingMoeModel(TextModel): model_arch = gguf.MODEL_ARCH.BAILINGMOE def set_vocab(self): @@ -5367,9 +5450,9 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") -@Model.register("ChameleonForConditionalGeneration") -@Model.register("ChameleonForCausalLM") # obsolete -class ChameleonModel(Model): +@ModelBase.register("ChameleonForConditionalGeneration") +@ModelBase.register("ChameleonForCausalLM") # obsolete +class ChameleonModel(TextModel): model_arch = gguf.MODEL_ARCH.CHAMELEON def set_gguf_parameters(self): @@ -5554,6 +5637,10 @@ def parse_args() -> argparse.Namespace: "--remote", action="store_true", help="(Experimental) Read safetensors file 
remotely without downloading to disk. Config and tokenizer files will still be downloaded. To use this feature, you need to specify Hugging Face model repo name instead of a local directory. For example: 'HuggingFaceTB/SmolLM2-1.7B-Instruct'. Note: To access gated repo, set HF_TOKEN environment variable to your Hugging Face token.", ) + parser.add_argument( + "--mmproj", action="store_true", + help="(Experimental) Export multimodal projector (mmproj) for vision models. This will only work on some vision models. A prefix 'mmproj-' will be added to the output file name.", + ) args = parser.parse_args() if not args.print_supported_models and args.model is None: @@ -5584,7 +5671,7 @@ def main() -> None: if args.print_supported_models: logger.error("Supported models:") - Model.print_registered_models() + ModelBase.print_registered_models() sys.exit(0) if args.verbose: @@ -5631,13 +5718,18 @@ def main() -> None: logger.info(f"Loading model: {dir_model.name}") - hparams = Model.load_hparams(dir_model) + hparams = ModelBase.load_hparams(dir_model) + + if args.mmproj: + if "mmproj" not in fname_out.name: + fname_out = ModelBase.add_prefix_to_filename(fname_out, "mmproj-") with torch.inference_mode(): output_type = ftype_map[args.outtype] model_architecture = hparams["architectures"][0] + model_type = ModelType.VISION if args.mmproj else ModelType.TEXT try: - model_class = Model.from_model_architecture(model_architecture) + model_class = ModelBase.from_model_architecture(model_architecture, model_type=model_type) except NotImplementedError: logger.error(f"Model {model_architecture} is not supported") sys.exit(1) diff --git a/convert_lora_to_gguf.py b/convert_lora_to_gguf.py index bdc991533b4e0..00a6733cbd360 100755 --- a/convert_lora_to_gguf.py +++ b/convert_lora_to_gguf.py @@ -24,7 +24,7 @@ import gguf # reuse model definitions from convert_hf_to_gguf.py -from convert_hf_to_gguf import LazyTorchTensor, Model +from convert_hf_to_gguf import LazyTorchTensor, ModelBase logger = logging.getLogger("lora-to-gguf") @@ -340,11 +340,11 @@ def load_hparams_from_hf(hf_model_id: str) -> dict[str, Any]: sys.exit(1) else: logger.info(f"Loading base model: {dir_base_model.name}") - hparams = Model.load_hparams(dir_base_model) + hparams = ModelBase.load_hparams(dir_base_model) with torch.inference_mode(): try: - model_class = Model.from_model_architecture(hparams["architectures"][0]) + model_class = ModelBase.from_model_architecture(hparams["architectures"][0]) except NotImplementedError: logger.error(f"Model {hparams['architectures'][0]} is not supported") sys.exit(1) diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 4d7340a56bd0c..180ae9880b124 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -50,7 +50,6 @@ // tensor name constants // -#define TN_TOKEN_EMBD "%s.token_embd.weight" #define TN_POS_EMBD "%s.position_embd.weight" #define TN_CLASS_EMBD "v.class_embd" #define TN_PATCH_EMBD "v.patch_embd.weight" // not rename tensor with ".0" postfix for backwrad compat @@ -66,8 +65,6 @@ #define TN_LN_2 "%s.blk.%d.ln2.%s" #define TN_LN_PRE "%s.pre_ln.%s" #define TN_LN_POST "%s.post_ln.%s" -#define TN_TEXT_PROJ "text_projection.weight" -#define TN_VIS_PROJ "visual_projection.weight" #define TN_LLAVA_PROJ "mm.%d.%s" #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s" #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s" diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8fcde2626aa7c..3f24705201d93 100644 --- a/gguf-py/gguf/constants.py +++ 
b/gguf-py/gguf/constants.py @@ -218,17 +218,37 @@ class Adapter: TYPE = "adapter.type" LORA_ALPHA = "adapter.lora.alpha" + class ClipVision: + PROJECTOR_TYPE = "clip.projector_type" + HAS_VISION_ENCODER = "clip.has_vision_encoder" + HAS_LLAVA_PROJECTOR = "clip.has_llava_projector" + IMAGE_SIZE = "clip.vision.image_size" + PATCH_SIZE = "clip.vision.patch_size" + EMBEDDING_LENGTH = "clip.vision.embedding_length" + FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length" + PROJECTION_DIM = "clip.vision.projection_dim" + BLOCK_COUNT = "clip.vision.block_count" + IMAGE_MEAN = "clip.vision.image_mean" + IMAGE_STD = "clip.vision.image_std" + USE_GELU = "clip.use_gelu" + + class Attention: + HEAD_COUNT = "clip.vision.attention.head_count" + LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" + # # recommended mapping of model tensor names for storage in gguf # class GGUFType: - MODEL = "model" - ADAPTER = "adapter" + MODEL = "model" + ADAPTER = "adapter" + CLIP_VISION = "clip-vision" class MODEL_ARCH(IntEnum): + CLIP_VISION = auto() # dummy arch for clip.cpp LLAMA = auto() LLAMA4 = auto() DECI = auto() @@ -297,6 +317,16 @@ class MODEL_ARCH(IntEnum): BAILINGMOE = auto() +class VISION_PROJECTOR_TYPE(IntEnum): + MLP = auto() + LDP = auto() + LDPV2 = auto() + RESAMPLER = auto() + GLM_EDGE = auto() + MERGER = auto() + GEMMA3 = auto() + + class MODEL_TENSOR(IntEnum): TOKEN_EMBD = auto() TOKEN_EMBD_NORM = auto() @@ -436,9 +466,41 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_K = auto() POSNET_ATTN_V = auto() POSNET_ATTN_OUT = auto() + # vision + V_MMPROJ = auto() + V_MMPROJ_FC = auto() + V_MMPROJ_MLP = auto() + V_MMPROJ_PEG = auto() + V_ENC_EMBD_CLS = auto() + V_ENC_EMBD_PATCH = auto() + V_ENC_EMBD_POS = auto() + V_ENC_ATTN_Q = auto() + V_ENC_ATTN_K = auto() + V_ENC_ATTN_V = auto() + V_ENC_INPUT_NORM = auto() + V_ENC_OUTPUT = auto() + V_ENC_OUTPUT_NORM = auto() + V_ENC_FFN_UP = auto() + V_ENC_FFN_DOWN = auto() + V_PRE_NORM = auto() + V_POST_NORM = auto() + V_MM_INP_PROJ = auto() # gemma3 + V_MM_SOFT_EMB_NORM = auto() # gemma3 + V_RESMPL_POS_EMBD_K = auto() # minicpmv + V_RESMPL_ATTN_Q = auto() # minicpmv + V_RESMPL_ATTN_K = auto() # minicpmv + V_RESMPL_ATTN_V = auto() # minicpmv + V_RESMPL_ATTN_OUT = auto() # minicpmv + V_RESMPL_KV = auto() # minicpmv + V_RESMPL_KV_NORM = auto() # minicpmv + V_RESMPL_POST_NORM = auto() # minicpmv + V_RESMPL_Q_NORM = auto() # minicpmv + V_RESMPL_PROJ = auto() # minicpmv + V_RESMPL_QUERY = auto() # minicpmv MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { + MODEL_ARCH.CLIP_VISION: "clip", # dummy arch for clip.cpp MODEL_ARCH.LLAMA: "llama", MODEL_ARCH.LLAMA4: "llama4", MODEL_ARCH.DECI: "deci", @@ -507,6 +569,16 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.BAILINGMOE: "bailingmoe", } +VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { + VISION_PROJECTOR_TYPE.MLP: "mlp", + VISION_PROJECTOR_TYPE.LDP: "ldp", + VISION_PROJECTOR_TYPE.LDPV2: "ldpv2", + VISION_PROJECTOR_TYPE.RESAMPLER: "resampler", + VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", + VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", + VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", +} + TENSOR_NAMES: dict[MODEL_TENSOR, str] = { MODEL_TENSOR.TOKEN_EMBD: "token_embd", MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm", @@ -646,9 +718,72 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + # vision + MODEL_TENSOR.V_MMPROJ: "mm.{bid}", + MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", + 
MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}", + MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}", + MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd", + MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd", + MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd", + MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q", + MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k", + MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v", + MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1", + MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out", + MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2", + MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", + MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", + MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", + MODEL_TENSOR.V_POST_NORM: "v.post_ln", + MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection", + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k", + MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q", + MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k", + MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v", + MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out", + MODEL_TENSOR.V_RESMPL_KV: "resampler.kv", + MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv", + MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post", + MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q", + MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", + MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { + MODEL_ARCH.CLIP_VISION: [ + MODEL_TENSOR.V_MMPROJ, + MODEL_TENSOR.V_MMPROJ_FC, + MODEL_TENSOR.V_MMPROJ_MLP, + MODEL_TENSOR.V_MMPROJ_PEG, + MODEL_TENSOR.V_ENC_EMBD_CLS, + MODEL_TENSOR.V_ENC_EMBD_PATCH, + MODEL_TENSOR.V_ENC_EMBD_POS, + MODEL_TENSOR.V_ENC_ATTN_Q, + MODEL_TENSOR.V_ENC_ATTN_K, + MODEL_TENSOR.V_ENC_ATTN_V, + MODEL_TENSOR.V_ENC_INPUT_NORM, + MODEL_TENSOR.V_ENC_OUTPUT, + MODEL_TENSOR.V_ENC_OUTPUT_NORM, + MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_DOWN, + MODEL_TENSOR.V_PRE_NORM, + MODEL_TENSOR.V_POST_NORM, + MODEL_TENSOR.V_MM_INP_PROJ, + MODEL_TENSOR.V_MM_SOFT_EMB_NORM, + MODEL_TENSOR.V_RESMPL_POS_EMBD_K, + MODEL_TENSOR.V_RESMPL_ATTN_Q, + MODEL_TENSOR.V_RESMPL_ATTN_K, + MODEL_TENSOR.V_RESMPL_ATTN_V, + MODEL_TENSOR.V_RESMPL_ATTN_OUT, + MODEL_TENSOR.V_RESMPL_KV, + MODEL_TENSOR.V_RESMPL_KV_NORM, + MODEL_TENSOR.V_RESMPL_POST_NORM, + MODEL_TENSOR.V_RESMPL_Q_NORM, + MODEL_TENSOR.V_RESMPL_PROJ, + MODEL_TENSOR.V_RESMPL_QUERY, + ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 0bc75cf513a9f..22066b2868019 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -886,6 +886,150 @@ class TensorNameMap: MODEL_TENSOR.POSNET_ATTN_OUT: ( "backbone.posnet.{bid}.proj_out", # wavtokenizer ), + + ############################################################################# + ## Vision encoder + + MODEL_TENSOR.V_MMPROJ: ( + "multi_modal_projector.linear_{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_FC: ( + "model.connector.modality_projection.proj", # SmolVLM + ), + + MODEL_TENSOR.V_MMPROJ_MLP: ( + "model.mm_projector.mlp.mlp.{bid}", + ), + + MODEL_TENSOR.V_MMPROJ_PEG: ( + "model.mm_projector.peg.peg.{bid}", + ), + + MODEL_TENSOR.V_ENC_EMBD_CLS: ( + "vision_tower.vision_model.embeddings.class_embedding", + ), + + MODEL_TENSOR.V_ENC_EMBD_PATCH: ( + "vision_tower.vision_model.embeddings.patch_embedding", + "vpm.embeddings.patch_embedding", + "model.vision_model.embeddings.patch_embedding", # SmolVLM + ), + + 
MODEL_TENSOR.V_ENC_EMBD_POS: ( + "vision_tower.vision_model.embeddings.position_embedding", + "vpm.embeddings.position_embedding", + "model.vision_model.embeddings.position_embedding", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_Q: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", + "vpm.encoder.layers.{bid}.self_attn.q_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_K: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", + "vpm.encoder.layers.{bid}.self_attn.k_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_ATTN_V: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", + "vpm.encoder.layers.{bid}.self_attn.v_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_INPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", + "vpm.encoder.layers.{bid}.layer_norm1", + "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_OUTPUT: ( + "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", + "vpm.encoder.layers.{bid}.self_attn.out_proj", + "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( + "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", + "vpm.encoder.layers.{bid}.layer_norm2", + "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_FFN_UP: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", + "vpm.encoder.layers.{bid}.mlp.fc1", + "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM + ), + + MODEL_TENSOR.V_ENC_FFN_DOWN: ( + "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", + "vpm.encoder.layers.{bid}.mlp.fc2", + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM + ), + + MODEL_TENSOR.V_PRE_NORM: ( + "vision_tower.vision_model.pre_layrnorm", + ), + + MODEL_TENSOR.V_POST_NORM: ( + "vision_tower.vision_model.post_layernorm", + "model.vision_model.post_layernorm", # SmolVLM + ), + + MODEL_TENSOR.V_MM_INP_PROJ: ( + "multi_modal_projector.mm_input_projection", + ), + + MODEL_TENSOR.V_MM_SOFT_EMB_NORM: ( + "multi_modal_projector.mm_soft_emb_norm", + ), + + MODEL_TENSOR.V_RESMPL_POS_EMBD_K: ( + "resampler.pos_embed_k", + ), + + MODEL_TENSOR.V_RESMPL_ATTN_Q: ( + "resampler.attn.in_proj_q", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_K: ( + "resampler.attn.in_proj_k", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_V: ( + "resampler.attn.in_proj_v", # tensor generated from resampler.attn.in_proj + ), + + MODEL_TENSOR.V_RESMPL_ATTN_OUT: ( + "resampler.attn.out_proj", + ), + + MODEL_TENSOR.V_RESMPL_KV: ( + "resampler.kv_proj", + ), + + MODEL_TENSOR.V_RESMPL_POST_NORM: ( + "resampler.ln_post", + ), + + MODEL_TENSOR.V_RESMPL_KV_NORM: ( + "resampler.ln_kv", + ), + + MODEL_TENSOR.V_RESMPL_Q_NORM: ( + "resampler.ln_q", + ), + + MODEL_TENSOR.V_RESMPL_PROJ: ( + "resampler.proj", + ), + + MODEL_TENSOR.V_RESMPL_QUERY: ( + "resampler.query", + ), } # architecture-specific block mappings From 84a9bf2fc2875205f0806fbbfbb66dc67204094c Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 21 Apr 2025 15:32:58 +0200 Subject: [PATCH 005/200] mtmd : merge llava, gemma3 and minicpmv CLI into single `llama-mtmd-cli` (#13012) * mtmd : merge `llava-cli` and `gemma3-cli` into single `mtmd-cli` * support 
for minicpmv * remove cpp files of llava and minicpmv * update hot topics * mtmd : add not supported msg for qwen2vl * Update examples/llava/mtmd.cpp Co-authored-by: Georgi Gerganov --------- Co-authored-by: Georgi Gerganov --- README.md | 1 + common/arg.cpp | 2 +- examples/llava/CMakeLists.txt | 22 +- examples/llava/deprecation-warning.cpp | 22 ++ examples/llava/llava-cli.cpp | 332 ---------------- examples/llava/minicpmv-cli.cpp | 354 ------------------ .../llava/{gemma3-cli.cpp => mtmd-cli.cpp} | 50 ++- examples/llava/mtmd.cpp | 311 ++++++++++++--- examples/llava/tests.sh | 34 +- src/llama-chat.cpp | 2 + 10 files changed, 359 insertions(+), 771 deletions(-) create mode 100644 examples/llava/deprecation-warning.cpp delete mode 100644 examples/llava/llava-cli.cpp delete mode 100644 examples/llava/minicpmv-cli.cpp rename examples/llava/{gemma3-cli.cpp => mtmd-cli.cpp} (82%) diff --git a/README.md b/README.md index cf45f23cf4475..a0e7bd2d213ed 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics +- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated - **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427 - **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 diff --git a/common/arg.cpp b/common/arg.cpp index 0b57f9da1eec2..80c318a0e50d0 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2726,7 +2726,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex [](common_params & params, const std::string & value) { params.chat_template = value; } - ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); + ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE")); add_opt(common_arg( {"--chat-template-file"}, "JINJA_TEMPLATE_FILE", string_format( diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 2d5061de460c0..6409b4f5e6cd0 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -61,19 +61,9 @@ if(TARGET BUILD_INFO) add_dependencies(mtmd BUILD_INFO) endif() -set(TARGET llama-llava-cli) -add_executable(${TARGET} llava-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-llava-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) - -set(TARGET llama-minicpmv-cli) -add_executable(${TARGET} minicpmv-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-minicpmv-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +add_executable(llama-llava-cli deprecation-warning.cpp) +add_executable(llama-gemma3-cli deprecation-warning.cpp) +add_executable(llama-minicpmv-cli deprecation-warning.cpp) set(TARGET llama-qwen2vl-cli) add_executable(${TARGET} qwen2vl-cli.cpp) @@ -82,9 +72,9 @@ install(TARGETS ${TARGET} RUNTIME) 
target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) -set(TARGET llama-gemma3-cli) -add_executable(${TARGET} gemma3-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-gemma3-cli) +set(TARGET llama-mtmd-cli) +add_executable(${TARGET} mtmd-cli.cpp) +set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) install(TARGETS ${TARGET} RUNTIME) target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/llava/deprecation-warning.cpp b/examples/llava/deprecation-warning.cpp new file mode 100644 index 0000000000000..dded0a56af96b --- /dev/null +++ b/examples/llava/deprecation-warning.cpp @@ -0,0 +1,22 @@ +#include +#include + +int main(int argc, char** argv) { + std::string filename = "main"; + if (argc >= 1) { + filename = argv[0]; + } + + // Get only the program name from the full path + size_t pos = filename.find_last_of("/\\"); + if (pos != std::string::npos) { + filename = filename.substr(pos+1); + } + + fprintf(stdout, "\n"); + fprintf(stdout, "WARNING: The binary '%s' is deprecated.\n", filename.c_str()); + fprintf(stdout, "Please use 'llama-mtmd-cli' instead.\n"); + fprintf(stdout, "\n"); + + return EXIT_FAILURE; +} diff --git a/examples/llava/llava-cli.cpp b/examples/llava/llava-cli.cpp deleted file mode 100644 index 0fe0e333a523d..0000000000000 --- a/examples/llava/llava-cli.cpp +++ /dev/null @@ -1,332 +0,0 @@ -#include "arg.h" -#include "base64.hpp" -#include "log.h" -#include "common.h" -#include "sampling.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" -#include "ggml.h" - -#include -#include -#include -#include - -static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { - int N = (int) tokens.size(); - for (int i = 0; i < N; i += n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > n_batch) { - n_eval = n_batch; - } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) { - LOG_ERR("%s : failed to eval. 
token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); - return false; - } - *n_past += n_eval; - } - return true; -} - -static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(ctx_llama, tokens, 1, n_past); -} - -static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ - std::string str2 = str; - std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); - eval_tokens(ctx_llama, embd_inp, n_batch, n_past); - return true; -} - -static const char * sample(struct common_sampler * smpl, - struct llama_context * ctx_llama, - int * n_past) { - const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); - common_sampler_accept(smpl, id, true); - - const llama_model * model = llama_get_model(ctx_llama); - const llama_vocab * vocab = llama_model_get_vocab(model); - - static std::string ret; - if (llama_vocab_is_eog(vocab, id)) { - ret = ""; - } else { - ret = common_token_to_piece(ctx_llama, id); - } - eval_id(ctx_llama, id, n_past); - return ret.c_str(); -} - -static const char* IMG_BASE64_TAG_BEGIN = ""; - -static void find_image_tag_in_prompt(const std::string& prompt, size_t& begin_out, size_t& end_out) { - begin_out = prompt.find(IMG_BASE64_TAG_BEGIN); - end_out = prompt.find(IMG_BASE64_TAG_END, (begin_out == std::string::npos) ? 0UL : begin_out); -} - -static bool prompt_contains_image(const std::string& prompt) { - size_t begin, end; - find_image_tag_in_prompt(prompt, begin, end); - return (begin != std::string::npos); -} - -// replaces the base64 image tag in the prompt with `replacement` -static llava_image_embed * llava_image_embed_make_with_prompt_base64(struct clip_ctx * ctx_clip, int n_threads, const std::string& prompt) { - size_t img_base64_str_start, img_base64_str_end; - find_image_tag_in_prompt(prompt, img_base64_str_start, img_base64_str_end); - if (img_base64_str_start == std::string::npos || img_base64_str_end == std::string::npos) { - LOG_ERR("%s: invalid base64 image tag. 
must be %s%s\n", __func__, IMG_BASE64_TAG_BEGIN, IMG_BASE64_TAG_END); - return NULL; - } - - auto base64_bytes_start = img_base64_str_start + strlen(IMG_BASE64_TAG_BEGIN); - auto base64_bytes_count = img_base64_str_end - base64_bytes_start; - auto base64_str = prompt.substr(base64_bytes_start, base64_bytes_count ); - - auto required_bytes = base64::required_encode_size(base64_str.size()); - auto img_bytes = std::vector(required_bytes); - base64::decode(base64_str.begin(), base64_str.end(), img_bytes.begin()); - - auto embed = llava_image_embed_make_with_bytes(ctx_clip, n_threads, img_bytes.data(), img_bytes.size()); - if (!embed) { - LOG_ERR("%s: could not load image from base64 string.\n", __func__); - return NULL; - } - - return embed; -} - -static std::string remove_image_from_prompt(const std::string& prompt, const char * replacement = "") { - size_t begin, end; - find_image_tag_in_prompt(prompt, begin, end); - if (begin == std::string::npos || end == std::string::npos) { - return prompt; - } - auto pre = prompt.substr(0, begin); - auto post = prompt.substr(end + strlen(IMG_BASE64_TAG_END)); - return pre + replacement + post; -} - -struct llava_context { - struct clip_ctx * ctx_clip = NULL; - struct llama_context * ctx_llama = NULL; - struct llama_model * model = NULL; -}; - -static void print_usage(int, char ** argv) { - LOG("\n example usage:\n"); - LOG("\n %s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG("\n note: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static struct llava_image_embed * load_image(llava_context * ctx_llava, common_params * params, const std::string & fname) { - - // load and preprocess the image - llava_image_embed * embed = NULL; - auto prompt = params->prompt; - if (prompt_contains_image(prompt)) { - if (!params->image.empty()) { - LOG_INF("using base64 encoded image instead of command line image path\n"); - } - embed = llava_image_embed_make_with_prompt_base64(ctx_llava->ctx_clip, params->cpuparams.n_threads, prompt); - if (!embed) { - LOG_ERR("%s: can't load image from prompt\n", __func__); - return NULL; - } - params->prompt = remove_image_from_prompt(prompt); - } else { - embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->cpuparams.n_threads, fname.c_str()); - if (!embed) { - fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str()); - return NULL; - } - } - - return embed; -} - -static void process_prompt(struct llava_context * ctx_llava, struct llava_image_embed * image_embed, common_params * params, const std::string & prompt) { - int n_past = 0; - - const int max_tgt_len = params->n_predict < 0 ? 
256 : params->n_predict; - - std::string system_prompt, user_prompt; - size_t image_pos = prompt.find(""); - if (image_pos != std::string::npos) { - // new templating mode: Provide the full prompt including system message and use as a placeholder for the image - system_prompt = prompt.substr(0, image_pos); - user_prompt = prompt.substr(image_pos + std::string("").length()); - LOG_INF("system_prompt: %s\n", system_prompt.c_str()); - if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, system_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - LOG_INF("user_prompt: %s\n", user_prompt.c_str()); - if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - } else { - // llava-1.5 native mode - system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:"; - user_prompt = prompt + "\nASSISTANT:"; - if (params->verbose_prompt) { - auto tmp = common_tokenize(ctx_llava->ctx_llama, user_prompt, true, true); - for (int i = 0; i < (int) tmp.size(); i++) { - LOG_INF("%6d -> '%s'\n", tmp[i], common_token_to_piece(ctx_llava->ctx_llama, tmp[i]).c_str()); - } - } - } - - eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, true); - llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past); - eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); - - // generate the response - - LOG("\n"); - - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); - if (!smpl) { - LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__); - exit(1); - } - - std::string response = ""; - for (int i = 0; i < max_tgt_len; i++) { - const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); - response += tmp; - if (strcmp(tmp, "") == 0) break; - if (strstr(tmp, "###")) break; // Yi-VL behavior - LOG("%s", tmp); - if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works) - if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6 - if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6 - - fflush(stdout); - } - - common_sampler_free(smpl); - LOG("\n"); -} - -static struct llama_model * llava_init(common_params * params) { - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = common_model_params_to_llama(*params); - - llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); - if (model == NULL) { - LOG_ERR("%s: unable to load model\n" , __func__); - return NULL; - } - return model; -} - -static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - const char * clip_path = params->mmproj.path.c_str(); - - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - - auto ctx_clip = clip_model_load(clip_path, GGML_LOG_LEVEL_INFO); - - llama_context_params ctx_params = common_context_params_to_llama(*params); - ctx_params.n_ctx = params->n_ctx < 2048 ? 
2048 : params->n_ctx; // we need a longer context size to process image embeddings - - llama_context * ctx_llama = llama_init_from_model(model, ctx_params); - - if (ctx_llama == NULL) { - LOG_ERR("%s: failed to create the llama_context\n" , __func__); - return NULL; - } - - auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); - - ctx_llava->ctx_llama = ctx_llama; - ctx_llava->ctx_clip = ctx_clip; - ctx_llava->model = model; - return ctx_llava; -} - -static void llava_free(struct llava_context * ctx_llava) { - if (ctx_llava->ctx_clip) { - clip_free(ctx_llava->ctx_clip); - ctx_llava->ctx_clip = NULL; - } - - llama_free(ctx_llava->ctx_llama); - llama_model_free(ctx_llava->model); - llama_backend_free(); -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, print_usage)) { - return 1; - } - - common_init(); - - if (params.mmproj.path.empty() || (params.image.empty() && !prompt_contains_image(params.prompt))) { - print_usage(argc, argv); - return 1; - } - - auto * model = llava_init(¶ms); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to init llava model\n", __func__); - return 1; - } - - if (prompt_contains_image(params.prompt)) { - auto * ctx_llava = llava_init_context(¶ms, model); - - auto * image_embed = load_image(ctx_llava, ¶ms, ""); - - // process the prompt - process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - - llama_perf_context_print(ctx_llava->ctx_llama); - llava_image_embed_free(image_embed); - ctx_llava->model = NULL; - llava_free(ctx_llava); - } else { - for (auto & image : params.image) { - auto * ctx_llava = llava_init_context(¶ms, model); - - auto * image_embed = load_image(ctx_llava, ¶ms, image); - if (!image_embed) { - LOG_ERR("%s: failed to load image %s. 
Terminating\n\n", __func__, image.c_str()); - return 1; - } - - // process the prompt - process_prompt(ctx_llava, image_embed, ¶ms, params.prompt); - - llama_perf_context_print(ctx_llava->ctx_llama); - llava_image_embed_free(image_embed); - ctx_llava->model = NULL; - llava_free(ctx_llava); - } - } - - llama_model_free(model); - - return 0; -} diff --git a/examples/llava/minicpmv-cli.cpp b/examples/llava/minicpmv-cli.cpp deleted file mode 100644 index 5ad970c220528..0000000000000 --- a/examples/llava/minicpmv-cli.cpp +++ /dev/null @@ -1,354 +0,0 @@ -#include "arg.h" -#include "log.h" -#include "common.h" -#include "sampling.h" -#include "clip.h" -#include "llava.h" -#include "llama.h" -#include "ggml.h" - -#include -#include -#include -#include -#include -#include // TODO: remove me - -struct llava_context { - struct clip_ctx * ctx_clip = NULL; - struct llama_context * ctx_llama = NULL; - struct llama_model * model = NULL; -}; - -static void show_additional_info(int /*argc*/, char ** argv) { - LOG("\nexample usage:\n\n%s -m --mmproj --image --image [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]); - LOG("\nnote: a lower temperature value like 0.1 is recommended for better quality.\n"); -} - -static struct llama_model * llava_init(common_params * params) { - llama_backend_init(); - llama_numa_init(params->numa); - - llama_model_params model_params = common_model_params_to_llama(*params); - - llama_model * model = llama_model_load_from_file(params->model.path.c_str(), model_params); - if (model == NULL) { - LOG_ERR("%s: unable to load model\n" , __func__); - return NULL; - } - return model; -} - -static struct llava_context * llava_init_context(common_params * params, llama_model * model) { - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - - llama_context_params ctx_params = common_context_params_to_llama(*params); - if (params->n_ctx < 2048) { - // warn user here, "Image processing requires at least 2048 context, setting context to 2048" - LOG_WRN("%s: Image processing requires at least 2048 context, setting context to 2048\n" , __func__); - ctx_params.n_ctx = 2048; - } else { - ctx_params.n_ctx = params->n_ctx; - } - - llama_context * ctx_llama = llama_init_from_model(model, ctx_params); - - if (ctx_llama == NULL) { - LOG_ERR("%s: failed to create the llama_context\n" , __func__); - return NULL; - } - - auto * ctx_llava = (struct llava_context *)malloc(sizeof(llava_context)); - - ctx_llava->ctx_llama = ctx_llama; - ctx_llava->model = model; - return ctx_llava; -} - -static void llava_free(struct llava_context * ctx_llava) { - if (ctx_llava->ctx_clip) { - clip_free(ctx_llava->ctx_clip); - ctx_llava->ctx_clip = NULL; - } - - llama_free(ctx_llava->ctx_llama); - llama_model_free(ctx_llava->model); - llama_backend_free(); -} - -static struct clip_ctx * clip_init_context(common_params * params) { - const char * clip_path = params->mmproj.path.c_str(); - - auto prompt = params->prompt; - if (prompt.empty()) { - prompt = "describe the image in detail."; - } - struct clip_context_params clip_params = { - /* use_gpu */ params->n_gpu_layers != 0, - /* verbosity */ GGML_LOG_LEVEL_INFO, // TODO: make this configurable - }; - auto * ctx_clip = clip_init(clip_path, clip_params); - return ctx_clip; -} - -static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past) { - int N = (int) tokens.size(); - for (int i = 0; i < N; i += n_batch) { - int n_eval = (int) tokens.size() - i; - if (n_eval > 
n_batch) { - n_eval = n_batch; - } - if (llama_decode(ctx_llama, llama_batch_get_one(&tokens[i], n_eval))) { - LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); - return false; - } - *n_past += n_eval; - } - return true; -} - -static bool eval_id(struct llama_context * ctx_llama, int id, int * n_past) { - std::vector tokens; - tokens.push_back(id); - return eval_tokens(ctx_llama, tokens, 1, n_past); -} - -static bool eval_string(struct llama_context * ctx_llama, const char* str, int n_batch, int * n_past, bool add_bos){ - std::string str2 = str; - std::vector embd_inp = common_tokenize(ctx_llama, str2, add_bos, true); - return eval_tokens(ctx_llama, embd_inp, n_batch, n_past); -} - -static void process_eval_image_embed(struct llava_context * ctx_llava, const struct llava_image_embed * embeds, int n_batch, int * n_past, int idx) { - float * image_embed = (float *)malloc(clip_embd_nbytes(ctx_llava->ctx_clip)); - std::memcpy(image_embed, embeds->embed + idx * clip_n_patches(ctx_llava->ctx_clip) * clip_n_mmproj_embd(ctx_llava->ctx_clip), clip_embd_nbytes(ctx_llava->ctx_clip)); - - auto * slice_embed = (llava_image_embed*)malloc(sizeof(llava_image_embed)); - slice_embed->embed = image_embed; - slice_embed->n_image_pos = clip_n_patches(ctx_llava->ctx_clip); - llava_eval_image_embed(ctx_llava->ctx_llama, slice_embed, n_batch, n_past); - llava_image_embed_free(slice_embed); -} - -static void process_image(struct llava_context * ctx_llava, struct llava_image_embed * embeds, common_params * params, int &n_past) { - std::string system_prompt; - int idx = 0; - int num_image_embeds = embeds->n_image_pos / clip_n_patches(ctx_llava->ctx_clip); - int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); - if (has_minicpmv_projector == 2) { - system_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"; - } - else if (has_minicpmv_projector == 3) { - system_prompt = "<|im_start|>user\n"; - } - else if (has_minicpmv_projector == 4) { - system_prompt = "<|im_start|>user\n"; - } - LOG_INF("%s: image token past: %d\n", __func__, n_past); - eval_string(ctx_llava->ctx_llama, (system_prompt+"").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (num_image_embeds > 1) { - if (has_minicpmv_projector == 2) { - size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { - for (size_t j = 0; j < num_image_embeds_col; ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == num_image_embeds_col - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); - } - } - } - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - } - else if (has_minicpmv_projector == 3 || has_minicpmv_projector == 4) { - size_t num_image_embeds_col = clip_uhd_num_image_embeds_col(ctx_llava->ctx_clip); - for (size_t i = 0; i < (num_image_embeds-1)/num_image_embeds_col; ++i) { - for (size_t j = 
0; j < num_image_embeds_col; ++j) { - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - process_eval_image_embed(ctx_llava, embeds, params->n_batch, &n_past, idx++); - eval_string(ctx_llava->ctx_llama, std::string("").c_str(), params->n_batch, &n_past, false); - if (j == num_image_embeds_col - 1) { - eval_string(ctx_llava->ctx_llama, std::string("\n").c_str(), params->n_batch, &n_past, false); - } - } - } - } - } - LOG_INF("%s: image token past: %d\n", __func__, n_past); -} - -static const char * sample(struct common_sampler * smpl, - struct llama_context * ctx_llama, - int * n_past) { - const llama_token id = common_sampler_sample(smpl, ctx_llama, -1); - common_sampler_accept(smpl, id, true); - - const llama_model * model = llama_get_model(ctx_llama); - const llama_vocab * vocab = llama_model_get_vocab(model); - - static std::string ret; - if (llama_vocab_is_eog(vocab, id)) { - ret = ""; - } else { - ret = common_token_to_piece(ctx_llama, id); - } - eval_id(ctx_llama, id, n_past); - return ret.c_str(); -} - -static struct llava_context * minicpmv_init(common_params * params, const std::string & fname, int &n_past){ - auto * ctx_clip = clip_init_context(params); - auto * embeds = llava_image_embed_make_with_filename(ctx_clip, params->cpuparams.n_threads, fname.c_str()); - if (!embeds) { - LOG_ERR("failed to load image %s. Terminating\n\n", fname.c_str()); - return NULL; - } - - // process the prompt - if (params->prompt.empty() && params->interactive == false) { - LOG_ERR("prompt should be given or interactive mode should be on"); - return NULL; - } - - auto * model = llava_init(params); - if (model == NULL) { - fprintf(stderr, "%s: error: failed to init minicpmv model\n", __func__); - return NULL; - } - const int64_t t_llava_init_start_us = ggml_time_us(); - auto * ctx_llava = llava_init_context(params, model); - ctx_llava->ctx_clip = ctx_clip; - const int64_t t_llava_init_end_us = ggml_time_us(); - float t_llava_init_ms = (t_llava_init_end_us - t_llava_init_start_us) / 1000.0; - LOG_INF("%s: llava init in %8.2f ms.\n", __func__, t_llava_init_ms); - - const int64_t t_process_image_start_us = ggml_time_us(); - process_image(ctx_llava, embeds, params, n_past); - const int64_t t_process_image_end_us = ggml_time_us(); - float t_process_image_ms = (t_process_image_end_us - t_process_image_start_us) / 1000.0; - LOG_INF("%s: llama process image in %8.2f ms.\n", __func__, t_process_image_ms); - - llava_image_embed_free(embeds); - return ctx_llava; -} - -static struct common_sampler * llama_init(struct llava_context * ctx_llava, common_params * params, const std::string & prompt, int & n_past, bool is_first = false){ - std::string user_prompt = prompt; - int has_minicpmv_projector = clip_is_minicpmv(ctx_llava->ctx_clip); - if (!is_first) { - if (has_minicpmv_projector == 2) { - user_prompt = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n" + prompt; - } - else if (has_minicpmv_projector == 3) { - user_prompt = "<|im_start|>user\n" + prompt; - } - else if (has_minicpmv_projector == 4) { - user_prompt = "<|im_start|>user\n" + prompt; - } - } - - eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false); - if (has_minicpmv_projector == 2) { - eval_string(ctx_llava->ctx_llama, "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n", params->n_batch, &n_past, false); - } - else if (has_minicpmv_projector == 3) { - eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, 
&n_past, false); - } - else if (has_minicpmv_projector == 4) { - eval_string(ctx_llava->ctx_llama, "<|im_end|><|im_start|>assistant\n", params->n_batch, &n_past, false); - } - - // generate the response - - LOG_INF("\n"); - - struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling); - return smpl; -} - -static const char * llama_loop(struct llava_context * ctx_llava,struct common_sampler * smpl, int &n_past){ - - const char * tmp = sample(smpl, ctx_llava->ctx_llama, &n_past); - return tmp; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - common_params params; - - if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_LLAVA, show_additional_info)) { - return 1; - } - - common_init(); - - if (params.mmproj.path.empty() || (params.image.empty())) { - show_additional_info(argc, argv); - return 1; - } - - for (auto & image : params.image) { - int n_past = 0; - auto * ctx_llava = minicpmv_init(¶ms, image, n_past); - - if (!params.prompt.empty()) { - LOG("%s\n", params.prompt.c_str()); - LOG(""); - auto * smpl = llama_init(ctx_llava, ¶ms, params.prompt, n_past, true); - const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response; - bool have_tmp = false; - for (int i = 0; i < max_tgt_len; i++) { - const auto * tmp = llama_loop(ctx_llava, smpl, n_past); - response += tmp; - if (strcmp(tmp, "") == 0){ - if (!have_tmp) { - continue; - } - break; - } - if (strstr(tmp, "###")) break; // Yi-VL behavior - have_tmp = true; - printf("%s", tmp); - if (strstr(response.c_str(), "")) break; // minicpm-v - - fflush(stdout); - } - common_sampler_free(smpl); - }else { - while (true) { - LOG(""); - std::string prompt; - std::getline(std::cin, prompt); - LOG(""); - auto * smpl = llama_init(ctx_llava, ¶ms, prompt, n_past, true); - const int max_tgt_len = params.n_predict < 0 ? 256 : params.n_predict; - std::string response; - for (int i = 0; i < max_tgt_len; i++) { - const auto * tmp = llama_loop(ctx_llava, smpl, n_past); - response += tmp; - if (strcmp(tmp, "") == 0) break; - printf("%s", tmp);// mistral llava-1.6 - if (strstr(response.c_str(), "")) break; // minicpm-v - fflush(stdout); - } - common_sampler_free(smpl); - } - } - printf("\n"); - llama_perf_context_print(ctx_llava->ctx_llama); - - ctx_llava->model = NULL; - llava_free(ctx_llava); - } - - return 0; -} diff --git a/examples/llava/gemma3-cli.cpp b/examples/llava/mtmd-cli.cpp similarity index 82% rename from examples/llava/gemma3-cli.cpp rename to examples/llava/mtmd-cli.cpp index 3d56647506c2c..e80845a2c5469 100644 --- a/examples/llava/gemma3-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -28,15 +28,16 @@ static bool g_is_generating = false; /** * Please note that this is NOT a production-ready stuff. - * It is a playground for trying Gemma 3 vision capabilities. + * It is a playground for trying multimodal support in llama.cpp. * For contributors: please keep this code simple and easy to understand. 
*/ static void show_additional_info(int /*argc*/, char ** argv) { LOG( - "Experimental CLI for using Gemma 3 vision model\n\n" + "Experimental CLI for multimodal\n\n" "Usage: %s [options] -m --mmproj --image -p \n\n" " -m and --mmproj are required\n" + " -hf user/repo can replace both -m and --mmproj in most cases\n" " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n", argv[0] ); @@ -56,7 +57,7 @@ static void sigint_handler(int signo) { } #endif -struct gemma3_context { +struct mtmd_cli_context { mtmd_context_ptr ctx_vision; common_init_result llama_init; @@ -70,18 +71,38 @@ struct gemma3_context { // so here we don't need to keep track of chat history common_chat_templates_ptr tmpls; + // support for legacy templates (models not having EOT token) + llama_tokens antiprompt_tokens; + int n_threads = 1; llama_pos n_past = 0; - gemma3_context(common_params & params) : llama_init(common_init_from_params(params)) { + mtmd_cli_context(common_params & params) : llama_init(common_init_from_params(params)) { model = llama_init.model.get(); lctx = llama_init.context.get(); vocab = llama_model_get_vocab(model); n_threads = params.cpuparams.n_threads; batch = llama_batch_init(params.n_batch, 0, 1); n_batch = params.n_batch; + + if (!llama_model_chat_template(model, nullptr) && params.chat_template.empty()) { + LOG_ERR("Model does not have chat template.\n"); + LOG_ERR(" For old llava models, you may need to use '--chat-template vicuna'\n"); + LOG_ERR(" For MobileVLM models, use '--chat-template deepseek'\n"); + exit(1); + } + tmpls = common_chat_templates_init(model, params.chat_template); + LOG_INF("%s: chat template example:\n%s\n", __func__, common_chat_format_example(tmpls.get(), params.use_jinja).c_str()); + init_vision_context(params); + + // load antiprompt tokens for legacy templates + if (params.chat_template == "vicuna") { + antiprompt_tokens = common_tokenize(lctx, "ASSISTANT:", false, true); + } else if (params.chat_template == "deepseek") { + antiprompt_tokens = common_tokenize(lctx, "###", false, true); + } } void init_vision_context(common_params & params) { @@ -97,6 +118,17 @@ struct gemma3_context { exit(1); } } + + bool check_antiprompt(const llama_tokens & generated_tokens) { + if (antiprompt_tokens.empty() || generated_tokens.size() < antiprompt_tokens.size()) { + return false; + } + return std::equal( + generated_tokens.end() - antiprompt_tokens.size(), + generated_tokens.end(), + antiprompt_tokens.begin() + ); + } }; struct decode_embd_batch { @@ -132,7 +164,8 @@ struct decode_embd_batch { } }; -static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_predict) { +static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) { + llama_tokens generated_tokens; for (int i = 0; i < n_predict; i++) { if (i > n_predict || !g_is_generating) { printf("\n"); @@ -140,9 +173,10 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_ } llama_token token_id = common_sampler_sample(smpl, ctx.lctx, -1); + generated_tokens.push_back(token_id); common_sampler_accept(smpl, token_id, true); - if (llama_vocab_is_eog(ctx.vocab, token_id)) { + if (llama_vocab_is_eog(ctx.vocab, token_id) || ctx.check_antiprompt(generated_tokens)) { printf("\n"); break; // end of generation } @@ -161,7 +195,7 @@ static int generate_response(gemma3_context & ctx, common_sampler * smpl, int n_ return 0; } -static int eval_message(gemma3_context & ctx, common_chat_msg & msg, std::vector & images_fname, bool add_bos 
= false) {
+static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vector<std::string> & images_fname, bool add_bos = false) {
     std::vector<mtmd_bitmap> bitmaps;
 
     common_chat_templates_inputs tmpl_inputs;
@@ -218,7 +252,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
 
-    gemma3_context ctx(params);
+    mtmd_cli_context ctx(params);
     printf("%s: %s\n", __func__, params.model.path.c_str());
 
     bool is_single_turn = !params.prompt.empty() && !params.image.empty();
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp
index 3fd5bebc6a7d5..8866c12e4ddef 100644
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -12,6 +12,15 @@
 #include 
 #include 
+// slice template, used by some llava-uhd models to correctly place the special tokens around image embeddings
+// models not having it (llava-1.6) will process embeddings without any special tokens in-between
+enum mtmd_slice_tmpl {
+    MTMD_SLICE_TMPL_NONE,
+    MTMD_SLICE_TMPL_MINICPMV_2_5,
+    MTMD_SLICE_TMPL_MINICPMV_2_6,
+    // TODO @ngxson : add support for idefics (SmolVLM)
+};
+
 struct mtmd_context {
     struct clip_ctx * ctx_clip;
     const struct llama_model * text_model;
@@ -21,6 +30,16 @@ struct mtmd_context {
     int n_threads;
     std::string image_marker;
 
+    // for minicpmv, we need special tokens in-between slices
+    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
+    llama_token tok_ov_img_start  = LLAMA_TOKEN_NULL; // overview image
+    llama_token tok_ov_img_end    = LLAMA_TOKEN_NULL; // overview image
+    llama_token tok_slices_start  = LLAMA_TOKEN_NULL; // start of all slices
+    llama_token tok_slices_end    = LLAMA_TOKEN_NULL; // end of all slices
+    llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_sli_img_end   = LLAMA_TOKEN_NULL; // single slice
+    llama_token tok_row_end       = LLAMA_TOKEN_NULL; // end of row
+
     // TODO @ngxson : add timings
 
     mtmd_context(const char * mmproj_fname,
@@ -38,11 +57,66 @@ struct mtmd_context {
             throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
         }
         this->text_model = text_model;
+
+        GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead");
+
+        int minicpmv_version = clip_is_minicpmv(ctx_clip);
+        if (minicpmv_version == 2) {
+            // minicpmv 2.5 format:
+            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
+            tok_ov_img_start  = lookup_token("<image>");
+            tok_ov_img_end    = lookup_token("</image>");
+            tok_slices_start  = lookup_token("<slice>");
+            tok_slices_end    = lookup_token("</slice>");
+            tok_sli_img_start = tok_ov_img_start;
+            tok_sli_img_end   = tok_ov_img_end;
+            tok_row_end       = lookup_token("\n");
+
+        } else if (minicpmv_version == 3 || minicpmv_version == 4) {
+            // minicpmv 2.6 format:
+            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
+            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
+            tok_ov_img_start  = lookup_token("<image>");
+            tok_ov_img_end    = lookup_token("</image>");
+            tok_sli_img_start = lookup_token("<slice>");
+            tok_sli_img_end   = lookup_token("</slice>");
+            tok_row_end       = lookup_token("\n");
+
+        } else if (minicpmv_version != 0) {
+            GGML_ASSERT(false && "unsupported minicpmv version");
+        }
     }
 
     ~mtmd_context() {
         clip_free(ctx_clip);
     }
+
+private:
+    llama_token lookup_token(const std::string & token_text) {
+        const llama_vocab * vocab = llama_model_get_vocab(text_model);
+        const int n_vocab = llama_vocab_n_tokens(vocab);
+        for (int i = 0; i < n_vocab; i++) {
+            if (token_to_piece(vocab, i, true) == token_text) {
+                return i;
+            }
+        }
+        return LLAMA_TOKEN_NULL;
+    }
+
+    std::string token_to_piece(const llama_vocab * vocab, llama_token token, bool special) {
+        std::string piece;
+        piece.resize(piece.capacity()); // using string internal cache, 15 bytes + '\n'
+        const int n_chars = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+        if (n_chars < 0) {
+            piece.resize(-n_chars);
+            int check = llama_token_to_piece(vocab, token, &piece[0], piece.size(), 0, special);
+            GGML_ASSERT(check == -n_chars);
+        } else {
+            piece.resize(n_chars);
+        }
+        return piece;
+    }
 };
 
 struct mtmd_image_tokens_data {
@@ -102,21 +176,58 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
     std::string prompt_modified(text.text);
     std::string marker_modified(ctx->image_marker);
-    projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
 
     // a bit hacky here, but works for now
     // for some models, we need to add prefix and suffix to the image embeddings
-    if (proj_type == PROJECTOR_TYPE_GEMMA3) {
+    if (clip_is_gemma3(ctx->ctx_clip)) {
+        // gemma 3
         // <start_of_image> ... (image embeddings) ... <end_of_image>
         marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
         string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
     }
+    // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
+    // for glm-edge, we don't need to add because the tokens are already in the returned embeddings
+
+    // TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens
+
     std::vector<std::string> parts = string_split_str(prompt_modified, ctx->image_marker);
     output.clear();
     output.reserve(parts.size());
 
     size_t i_img = 0;
 
+    // utility for adding raw tokens
+    auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
+        mtmd_input_chunk chunk{
+            MTMD_INPUT_CHUNK_TYPE_TEXT,
+            std::move(tokens),
+            {},
+        };
+        output.emplace_back(std::move(chunk));
+    };
+
+    // utility for splitting batch of multiple images into chunks of batch having single images
+    auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
+        std::vector<mtmd_input_chunk> chunks;
+
+        for (auto & entry : batch_f32.entries) {
+            mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
+            image_tokens->nx = clip_n_patches(ctx->ctx_clip);
+            image_tokens->ny = 1;
+            image_tokens->batch_f32.entries.push_back(std::move(entry));
+            image_tokens->id = id;
+
+            mtmd_input_chunk chunk{
+                MTMD_INPUT_CHUNK_TYPE_IMAGE,
+                {},
+                std::move(image_tokens),
+            };
+            chunks.emplace_back(std::move(chunk));
+        }
+
+        return chunks;
+    };
+
     for (const auto & part : parts) {
         //printf("tokenizing part: %s\n", part.c_str());
         bool add_bos = &parts.front() == &part;
@@ -139,12 +250,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
                 return 1;
             }
 
-            // shim layer
+            // convert mtmd_bitmap to clip_image_u8
             clip_image_u8_ptr img_u8(clip_image_u8_init());
             img_u8->nx = bitmaps[i_img].nx;
             img_u8->ny = bitmaps[i_img].ny;
img_u8->buf.resize(bitmaps[i_img].data.size()); std::memcpy(img_u8->buf.data(), bitmaps[i_img].data.data(), img_u8->nx * img_u8->ny * 3); + clip_image_size img_u8_size{img_u8->nx, img_u8->ny}; // preprocess image clip_image_f32_batch batch_f32; @@ -154,19 +266,70 @@ int32_t mtmd_tokenize(mtmd_context * ctx, return 2; } - mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = clip_n_patches(ctx->ctx_clip); // TODO @ngxson : use clip_n_patches_by_image - image_tokens->ny = 1; // TODO - image_tokens->batch_f32 = std::move(batch_f32); - image_tokens->id = bitmaps[i_img].id; // optional + if (ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5 || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6) { + // split batch into chunks of single images + auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_img].id); + GGML_ASSERT(chunks.size() > 0); + + // add overview image + add_text_chunk({ctx->tok_ov_img_start}); + output.emplace_back(std::move(chunks.front())); + chunks.erase(chunks.begin()); + add_text_chunk({ctx->tok_ov_img_end}); + + // add slices + if (!chunks.empty()) { + clip_add_load_image_size(ctx->ctx_clip, &img_u8_size); + int n_col = clip_uhd_num_image_embeds_col(ctx->ctx_clip); + int n_row = (int)chunks.size() / n_col; + GGML_ASSERT(n_row * n_col == (int)chunks.size()); + if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) { + add_text_chunk({ctx->tok_slices_start}); + } + for (int y = 0; y < n_row; y++) { + for (int x = 0; x < n_col; x++) { + if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) { + add_text_chunk({ctx->tok_sli_img_start}); + } + output.emplace_back(std::move(chunks[y * n_col + x])); + if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) { + add_text_chunk({ctx->tok_sli_img_end}); + } + } + if (ctx->tok_row_end != LLAMA_TOKEN_NULL && y != n_row - 1) { + add_text_chunk({ctx->tok_row_end}); + } + } + if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) { + add_text_chunk({ctx->tok_slices_end}); + } + } + + } else { + mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); + image_tokens->nx = clip_n_patches(ctx->ctx_clip) * batch_f32.entries.size(); // TODO @ngxson : use clip_n_patches_by_image + image_tokens->ny = 1; // TODO + image_tokens->batch_f32 = std::move(batch_f32); + image_tokens->id = bitmaps[i_img].id; // optional + + LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx); + LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); + LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size()); + + if (clip_is_glm(ctx->ctx_clip)) { + // glm-edge + image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings + } + + mtmd_input_chunk chunk{ + MTMD_INPUT_CHUNK_TYPE_IMAGE, + {}, + std::move(image_tokens), + }; + output.emplace_back(std::move(chunk)); + } - mtmd_input_chunk chunk{ - MTMD_INPUT_CHUNK_TYPE_IMAGE, - {}, - std::move(image_tokens), - }; - output.emplace_back(std::move(chunk)); - i_img++; + i_img++; // move to next image } } @@ -198,11 +361,35 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); - bool ok = clip_image_batch_encode( - ctx->ctx_clip, - ctx->n_threads, - &image_tokens->batch_f32, - ctx->image_embd_v.data()); + bool ok = false; + + // only effective for minicpmv and qwen2vl, other models will ignore load_image_size + { + clip_image_size slice_size{ + 
image_tokens->batch_f32.entries[0]->nx, + image_tokens->batch_f32.entries[0]->ny}; + clip_add_load_image_size(ctx->ctx_clip, &slice_size); + } + + if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) { + // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() + const auto & entries = image_tokens->batch_f32.entries; + for (size_t i = 0; i < entries.size(); i++) { + int n_tokens_per_image = clip_n_patches(ctx->ctx_clip); + ok = clip_image_encode( + ctx->ctx_clip, + ctx->n_threads, + entries[i].get(), + ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image); + } + } else { + ok = clip_image_batch_encode( + ctx->ctx_clip, + ctx->n_threads, + &image_tokens->batch_f32, + ctx->image_embd_v.data()); + } + return ok ? 0 : 1; } @@ -268,28 +455,31 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, int32_t ret; llama_pos n_past = pos0; llama_batch text_batch = llama_batch_init(n_batch, 0, 1); + int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); for (auto & chunk : chunks) { bool is_last = &chunk == &chunks.back(); if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { - // TODO @ngxson : may need to split into smaller batches text_batch.n_tokens = chunk.tokens_text.size(); - for (size_t i = 0; i < chunk.tokens_text.size(); i++) { - text_batch.token [i] = chunk.tokens_text[i]; - text_batch.pos [i] = n_past++; - text_batch.n_seq_id[i] = 1; - text_batch.seq_id [i][0] = seq_id; - text_batch.logits [i] = false; - } - if (is_last) { - // always get logits for last input chunk - text_batch.logits[text_batch.n_tokens - 1] = true; - } - ret = llama_decode(lctx, text_batch); - if (ret != 0) { - LOG_ERR("failed to decode text\n"); - llama_batch_free(text_batch); - return ret; + size_t i = 0; + while (i < chunk.tokens_text.size()) { // split into batches + for (; i < chunk.tokens_text.size() && text_batch.n_tokens < n_batch; i++) { + text_batch.token [i] = chunk.tokens_text[i]; + text_batch.pos [i] = n_past++; + text_batch.n_seq_id[i] = 1; + text_batch.seq_id [i][0] = seq_id; + text_batch.logits [i] = false; + } + if (is_last) { + // always get logits for last input chunk + text_batch.logits[text_batch.n_tokens - 1] = true; + } + ret = llama_decode(lctx, text_batch); + if (ret != 0) { + LOG_ERR("failed to decode text\n"); + llama_batch_free(text_batch); + return ret; + } } } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { @@ -297,7 +487,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, GGML_ASSERT(chunk.tokens_image != nullptr); int64_t t0 = ggml_time_ms(); if (ctx->print_timings) { - LOG_INF("encoding image...\n"); + LOG_INF("encoding image or slice...\n"); } ret = mtmd_encode(ctx, chunk.tokens_image.get()); if (ret != 0) { @@ -306,24 +496,47 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, return ret; } if (ctx->print_timings) { - LOG_INF("image encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); + LOG_INF("image/slice encoded in %" PRId64 " ms\n", ggml_time_ms() - t0); } int32_t n_tokens = mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); + int32_t i_batch = 0; + int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; float * embd = mtmd_get_output_embd(ctx); - decode_embd_batch batch_img(embd, n_tokens, n_past, 0); - int64_t t1 = ggml_time_ms(); - ret = llama_decode(lctx, batch_img.batch); - if (ret != 0) { - LOG_ERR("failed to decode image\n"); - llama_batch_free(text_batch); - return ret; + + if (mtmd_decode_use_non_causal(ctx)) { + llama_set_causal_attn(lctx, false); + // TODO 
@ngxson : need to make sure only one image is processed at a time, and n_ubatch must be enough to hold the image } - if (ctx->print_timings) { - LOG_INF("image decoded in %" PRId64 " ms\n", ggml_time_ms() - t1); + + while (i_batch < n_img_batches) { // split into batches + int32_t pos_offset = i_batch*n_batch; + int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); + float * embd_batch = embd + pos_offset*n_mmproj_embd; + decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0); + + printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); + + int64_t t1 = ggml_time_ms(); + ret = llama_decode(lctx, batch_img.batch); + if (ret != 0) { + LOG_ERR("failed to decode image\n"); + llama_set_causal_attn(lctx, true); // restore causal attn + llama_batch_free(text_batch); + return ret; + } + + if (ctx->print_timings) { + LOG_INF("image decoded (batch %d/%d) in %" PRId64 " ms\n", i_batch+1, n_img_batches, ggml_time_ms() - t1); + } + + i_batch++; + n_past += n_tokens_batch; } - n_past += n_tokens; + if (mtmd_decode_use_non_causal(ctx)) { + llama_set_causal_attn(lctx, true); + } } else { GGML_ASSERT(false && "chunk type not supported"); diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh index cc9bda8769ca6..61ebb3ac18ead 100755 --- a/examples/llava/tests.sh +++ b/examples/llava/tests.sh @@ -17,26 +17,30 @@ cd $PROJ_ROOT arr_bin=() arr_hf=() +arr_tmpl=() # chat template add_test() { local bin=$1 local hf=$2 + local tmpl=${3:-""} # default to empty string if not provided arr_bin+=("$bin") arr_hf+=("$hf") + arr_tmpl+=("$tmpl") } -add_test "llama-gemma3-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" -add_test "llama-llava-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" -add_test "llama-llava-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" -add_test "llama-llava-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M" -add_test "llama-llava-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" -add_test "llama-llava-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" -add_test "llama-llava-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M" -add_test "llama-minicpmv-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted -add_test "llama-minicpmv-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" -add_test "llama-minicpmv-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" +add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek" +add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M" +add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna" +add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K" "vicuna" +add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted +add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" +add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" +# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" # this model has broken chat template, not usable + ############### cmake --build build -j --target "${arr_bin[@]}" @@ -46,12 +50,20 @@ arr_res=() for i in "${!arr_bin[@]}"; do bin="${arr_bin[$i]}" hf="${arr_hf[$i]}" + tmpl="${arr_tmpl[$i]}" echo "Running test with binary: $bin and HF model: $hf" echo "" echo "" - output=$("$PROJ_ROOT/build/bin/$bin" -hf "$hf" --image $SCRIPT_DIR/test-1.jpeg -p "what is the publisher name of the newspaper?" 
--temp 0 2>&1 | tee /dev/tty) + output=$(\ + "$PROJ_ROOT/build/bin/$bin" \ + -hf "$hf" \ + --image $SCRIPT_DIR/test-1.jpeg \ + -p "what is the publisher name of the newspaper?" \ + --temp 0 -n 128 \ + ${tmpl:+--chat-template "$tmpl"} \ + 2>&1 | tee /dev/tty) echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 721faa4e8147e..f62850ca574b0 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -121,6 +121,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_PHI_3; } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { return tmpl_contains("") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; + } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { + return LLM_CHAT_TEMPLATE_GLMEDGE; } else if (tmpl_contains("<|user|>") && tmpl_contains("<|endoftext|>")) { return LLM_CHAT_TEMPLATE_ZEPHYR; } else if (tmpl_contains("bos_token + message['role']")) { From 5368ddda7a262d195b54687a31009dcc1f8b1602 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 21 Apr 2025 19:13:30 +0530 Subject: [PATCH 006/200] SYCL: Add non-contiguous support in ROPE (#12993) ggml-ci --- ggml/src/ggml-sycl/ggml-sycl.cpp | 7 +- ggml/src/ggml-sycl/rope.cpp | 197 +++++++++++++++---------------- ggml/src/ggml-sycl/rope.hpp | 2 +- 3 files changed, 96 insertions(+), 110 deletions(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 1de34c96298b9..8081a77b74f67 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -3168,11 +3168,6 @@ static void ggml_sycl_diag_mask_inf(ggml_backend_sycl_context & ctx, ggml_tensor ggml_sycl_op_diag_mask_inf(ctx, dst); } -static void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous(dst->src[0])); // TODO: this restriction is temporary until non-cont support is implemented - ggml_sycl_op_rope(ctx, dst); -} - static void ggml_sycl_pool2d(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { ggml_sycl_op_pool2d(ctx, dst); } @@ -4002,7 +3997,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g if (mode == GGML_ROPE_TYPE_MROPE) { return false; } - return ggml_is_contiguous(op->src[0]); + return true; } case GGML_OP_IM2COL: return true; diff --git a/ggml/src/ggml-sycl/rope.cpp b/ggml/src/ggml-sycl/rope.cpp index 80e050f241496..4e276d3b62e42 100644 --- a/ggml/src/ggml-sycl/rope.cpp +++ b/ggml/src/ggml-sycl/rope.cpp @@ -34,23 +34,21 @@ static void rope_yarn( *sin_theta = sycl::sin(theta) * mscale; } -template -static void rope_norm( - const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, - const sycl::nd_item<3> &item_ct1) { - const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1)); +template +static void rope_norm(const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, + const int32_t * pos, float freq_scale, float ext_factor, float attn_factor, + const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, + const sycl::nd_item<3> & item_ct1) { + const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1)); if (i0 >= ne0) { return; } - const int row = 
item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); if (i0 >= n_dims) { - const int i = row*ne0 + i0; + const int i = row * ne0 + i0; dst[i + 0] = x[i + 0]; dst[i + 1] = x[i + 1]; @@ -58,42 +56,43 @@ static void rope_norm( return; } - const int i = row*ne0 + i0; - const int i2 = row/p_delta_rows; + const int row0 = row % ne1; + const int channel0 = row / ne1; + + const int i = row * ne0 + i0; + const int i2 = channel0 * s2 + row0 * s1 + i0; - const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f); + const float theta_base = pos[channel0] * sycl::pow(theta_scale, i0 / 2.0f); - const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; + const float freq_factor = has_ff ? freq_factors[i0 / 2] : 1.0f; float cos_theta; float sin_theta; - rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); - const float x0 = x[i + 0]; - const float x1 = x[i + 1]; + const float x0 = x[i2 + 0]; + const float x1 = x[i2 + 1]; - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + 1] = x0*sin_theta + x1*cos_theta; + dst[i + 0] = x0 * cos_theta - x1 * sin_theta; + dst[i + 1] = x0 * sin_theta + x1 * cos_theta; } -template -static void rope_neox( - const T * x, T * dst, int ne0, int n_dims, const int32_t * pos, float freq_scale, int p_delta_rows, - float ext_factor, float attn_factor, rope_corr_dims corr_dims, float theta_scale, const float * freq_factors, - const sycl::nd_item<3> &item_ct1) { - const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + - item_ct1.get_local_id(1)); +template +static void rope_neox(const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, const int n_dims, + const int32_t * pos, const float freq_scale, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float theta_scale, const float * freq_factors, + const sycl::nd_item<3> & item_ct1) { + const int i0 = 2 * (item_ct1.get_local_range(1) * item_ct1.get_group(1) + item_ct1.get_local_id(1)); if (i0 >= ne0) { return; } - const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - item_ct1.get_local_id(2); + const int row = item_ct1.get_local_range(2) * item_ct1.get_group(2) + item_ct1.get_local_id(2); if (i0 >= n_dims) { - const int i = row*ne0 + i0; + const int i = row * ne0 + i0; dst[i + 0] = x[i + 0]; dst[i + 1] = x[i + 1]; @@ -101,23 +100,26 @@ static void rope_neox( return; } - const int i = row*ne0 + i0/2; - const int i2 = row/p_delta_rows; + const int row0 = row % ne1; + const int channel0 = row / ne1; + + const int i = row * ne0 + i0 / 2; + const int i2 = channel0 * s2 + row0 * s1 + i0 / 2; - const float theta_base = pos[i2] * sycl::pow(theta_scale, i0 / 2.0f); + const float theta_base = pos[channel0] * sycl::pow(theta_scale, i0 / 2.0f); - const float freq_factor = has_ff ? freq_factors[i0/2] : 1.0f; + const float freq_factor = has_ff ? 
freq_factors[i0 / 2] : 1.0f; float cos_theta; float sin_theta; - rope_yarn(theta_base/freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); + rope_yarn(theta_base / freq_factor, freq_scale, corr_dims, i0, ext_factor, attn_factor, &cos_theta, &sin_theta); - const float x0 = x[i + 0]; - const float x1 = x[i + n_dims/2]; + const float x0 = x[i2 + 0]; + const float x1 = x[i2 + n_dims / 2]; - dst[i + 0] = x0*cos_theta - x1*sin_theta; - dst[i + n_dims/2] = x0*sin_theta + x1*cos_theta; + dst[i + 0] = x0 * cos_theta - x1 * sin_theta; + dst[i + n_dims / 2] = x0 * sin_theta + x1 * cos_theta; } template @@ -163,18 +165,18 @@ static void rope_vision(const T * x, T * dst, const int ne0, const int ne1, cons } template -static void rope_norm_sycl( - const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { +static void rope_norm_sycl(const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, + const int n_dims, int nr, const int32_t * pos, const float freq_scale, const float freq_base, + const float ext_factor, const float attn_factor, const rope_corr_dims corr_dims, + const float * freq_factors, queue_ptr stream) { GGML_ASSERT(ne0 % 2 == 0); const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); + const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE); const sycl::range<3> block_nums(1, num_blocks_x, nr); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); if (freq_factors == nullptr) { /* @@ -182,61 +184,47 @@ static void rope_norm_sycl( the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. */ - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_norm(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, - ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, - item_ct1); - }); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, + theta_scale, freq_factors, item_ct1); + }); } else { /* DPCT1049:41: The work-group size passed to the SYCL kernel may exceed the limit. To get the device limit, query info::device::max_work_group_size. Adjust the work-group size if needed. 
*/ - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_norm(x, dst, ne0, n_dims, pos, freq_scale, p_delta_rows, - ext_factor, attn_factor, corr_dims, theta_scale, freq_factors, - item_ct1); - }); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + rope_norm(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, + theta_scale, freq_factors, item_ct1); + }); } } template -static void rope_neox_sycl( - const T *x, T *dst, int ne0, int n_dims, int nr, const int32_t *pos, float freq_scale, int p_delta_rows, - float freq_base, float ext_factor, float attn_factor, rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { +static void rope_neox_sycl(const T * x, T * dst, const int ne0, const int ne1, const int s1, const int s2, + const int n_dims, const int nr, const int32_t * pos, const float freq_scale, + const float freq_base, const float ext_factor, const float attn_factor, + const rope_corr_dims corr_dims, const float * freq_factors, queue_ptr stream) { GGML_ASSERT(ne0 % 2 == 0); const sycl::range<3> block_dims(1, SYCL_ROPE_BLOCK_SIZE, 1); - const int num_blocks_x = (ne0 + 2*SYCL_ROPE_BLOCK_SIZE - 1) / (2*SYCL_ROPE_BLOCK_SIZE); + const int num_blocks_x = (ne0 + 2 * SYCL_ROPE_BLOCK_SIZE - 1) / (2 * SYCL_ROPE_BLOCK_SIZE); const sycl::range<3> block_nums(1, num_blocks_x, nr); - const float theta_scale = powf(freq_base, -2.0f/n_dims); + const float theta_scale = powf(freq_base, -2.0f / n_dims); - dpct::has_capability_or_fail(stream->get_device(), - {sycl::aspect::fp16}); + dpct::has_capability_or_fail(stream->get_device(), { sycl::aspect::fp16 }); if (freq_factors == nullptr) { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ne0, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, freq_factors, - item_ct1); - }); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, + theta_scale, freq_factors, item_ct1); + }); } else { - stream->parallel_for( - sycl::nd_range<3>(block_nums * block_dims, block_dims), - [=](sycl::nd_item<3> item_ct1) { - rope_neox(x, dst, ne0, n_dims, pos, freq_scale, - p_delta_rows, ext_factor, attn_factor, - corr_dims, theta_scale, freq_factors, - item_ct1); - }); + stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { + rope_neox(x, dst, ne0, ne1, s1, s2, n_dims, pos, freq_scale, ext_factor, attn_factor, corr_dims, + theta_scale, freq_factors, item_ct1); + }); } } @@ -272,7 +260,7 @@ static void rope_vision_sycl(const T * x, T * dst, const int ne0, const int ne1, } } -void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { +inline void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); @@ -329,43 +317,46 @@ void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst) { if (is_neox) { GGML_SYCL_DEBUG("%s: neox path\n", __func__); if (dst->src[0]->type == GGML_TYPE_F32) { - rope_neox_sycl( - (const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, 
nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, main_stream - ); + rope_neox_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, s01, s02, n_dims, nr, + pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream); } else if (dst->src[0]->type == GGML_TYPE_F16) { - rope_neox_sycl( - (const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, main_stream - ); + rope_neox_sycl((const sycl::half *) dst->src[0]->data, (sycl::half *) dst->data, ne00, ne01, s01, s02, + n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, + main_stream); } else { GGML_ABORT("fatal error"); } } else if (is_vision) { GGML_SYCL_DEBUG("%s: vision path\n", __func__); if (dst->src[0]->type == GGML_TYPE_F16) { - rope_vision_sycl((const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, main_stream); + rope_vision_sycl((const sycl::half *) dst->src[0]->data, (sycl::half *) dst->data, ne00, ne01, ne02, s01, + s02, n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, + freq_factors, sections, main_stream); } else if (dst->src[0]->type == GGML_TYPE_F32) { - rope_vision_sycl((const float *) dst->src[0]->data, (float *)dst->data, ne00, ne01, ne02, s01, s02, n_dims, nr, pos, freq_scale, - freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, main_stream); + rope_vision_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, ne02, s01, s02, n_dims, + nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, sections, + main_stream); } else { GGML_ABORT("Fatal error: Tensor type unsupported!"); } } else { GGML_SYCL_DEBUG("%s: norm path\n", __func__); if (dst->src[0]->type == GGML_TYPE_F32) { - rope_norm_sycl( - (const float *)dst->src[0]->data, (float *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, main_stream - ); + rope_norm_sycl((const float *) dst->src[0]->data, (float *) dst->data, ne00, ne01, s01, s02, n_dims, nr, + pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, main_stream); } else if (dst->src[0]->type == GGML_TYPE_F16) { - rope_norm_sycl( - (const sycl::half *)dst->src[0]->data, (sycl::half *)dst->data, ne00, n_dims, nr, pos, freq_scale, ne01, freq_base, ext_factor, - attn_factor, corr_dims, freq_factors, main_stream - ); + rope_norm_sycl((const sycl::half *) dst->src[0]->data, (sycl::half *) dst->data, ne00, ne01, s01, s02, + n_dims, nr, pos, freq_scale, freq_base, ext_factor, attn_factor, corr_dims, freq_factors, + main_stream); } else { GGML_ABORT("fatal error"); } } } + +void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_rope(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + diff --git a/ggml/src/ggml-sycl/rope.hpp b/ggml/src/ggml-sycl/rope.hpp index a399bddb8a07b..8c7141aac5c9b 100644 --- a/ggml/src/ggml-sycl/rope.hpp +++ b/ggml/src/ggml-sycl/rope.hpp @@ -15,6 +15,6 @@ #include "common.hpp" -void ggml_sycl_op_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst); +void ggml_sycl_rope(ggml_backend_sycl_context & ctx, ggml_tensor *dst); #endif // GGML_SYCL_ROPE_HPP From 
1d735c0b4fa0551c51c2f4ac888dd9a01f447985 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Mon, 21 Apr 2025 18:13:51 +0200 Subject: [PATCH 007/200] ggml : add SSE 4.2 and x64 base variant for CPUs without AVX (#12871) * ggml : add SSE 4.2 variant for CPUs without AVX * ggml : add x64 base ABI variant --- ggml/CMakeLists.txt | 1 + ggml/src/CMakeLists.txt | 15 +++++++++------ ggml/src/ggml-cpu/CMakeLists.txt | 8 +++++--- ggml/src/ggml-cpu/cpu-feats-x86.cpp | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 438c2a7309191..61fe15a15f074 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -107,6 +107,7 @@ message(DEBUG "INS_ENB : ${INS_ENB}") option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF) option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON) option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF) +option(GGML_SSE42 "ggml: enable SSE 4.2" ${INS_ENB}) option(GGML_AVX "ggml: enable AVX" ${INS_ENB}) option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF) option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB}) diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index f00700da71fcd..43d9fc4fe25e0 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -267,6 +267,7 @@ function(ggml_add_cpu_backend_variant tag_name) set(GGML_CPU_TAG_NAME ${tag_name}) # other: OPENMP LLAMAFILE CPU_HBM foreach (feat NATIVE + SSE42 AVX AVX2 BMI2 AVX_VNNI FMA F16C AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8 AMX_BF16) @@ -286,14 +287,16 @@ if (GGML_CPU_ALL_VARIANTS) if (NOT GGML_BACKEND_DL) message(FATAL_ERROR "GGML_CPU_ALL_VARIANTS requires GGML_BACKEND_DL") endif() - ggml_add_cpu_backend_variant(sandybridge AVX) - ggml_add_cpu_backend_variant(haswell AVX F16C AVX2 BMI2 FMA) - ggml_add_cpu_backend_variant(skylakex AVX F16C AVX2 BMI2 FMA AVX512) - ggml_add_cpu_backend_variant(icelake AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) - ggml_add_cpu_backend_variant(alderlake AVX F16C AVX2 BMI2 FMA AVX_VNNI) + ggml_add_cpu_backend_variant(x64) + ggml_add_cpu_backend_variant(sse42 SSE42) + ggml_add_cpu_backend_variant(sandybridge SSE42 AVX) + ggml_add_cpu_backend_variant(haswell SSE42 AVX F16C AVX2 BMI2 FMA) + ggml_add_cpu_backend_variant(skylakex SSE42 AVX F16C AVX2 BMI2 FMA AVX512) + ggml_add_cpu_backend_variant(icelake SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI) + ggml_add_cpu_backend_variant(alderlake SSE42 AVX F16C AVX2 BMI2 FMA AVX_VNNI) if (NOT MSVC) # MSVC doesn't support AMX - ggml_add_cpu_backend_variant(sapphirerapids AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) + ggml_add_cpu_backend_variant(sapphirerapids SSE42 AVX F16C AVX2 BMI2 FMA AVX512 AVX512_VBMI AVX512_VNNI AVX512_BF16 AMX_TILE AMX_INT8) endif() elseif (GGML_CPU) ggml_add_cpu_backend_variant_impl("") diff --git a/ggml/src/ggml-cpu/CMakeLists.txt b/ggml/src/ggml-cpu/CMakeLists.txt index e73a3b69b5da2..6a652738c10a9 100644 --- a/ggml/src/ggml-cpu/CMakeLists.txt +++ b/ggml/src/ggml-cpu/CMakeLists.txt @@ -222,7 +222,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name) elseif (GGML_AVX) list(APPEND ARCH_FLAGS /arch:AVX) list(APPEND ARCH_DEFINITIONS GGML_AVX) - else () + elseif (GGML_SSE42) list(APPEND ARCH_FLAGS /arch:SSE4.2) list(APPEND ARCH_DEFINITIONS GGML_SSE42) endif() @@ -237,8 +237,10 @@ function(ggml_add_cpu_backend_variant_impl tag_name) if (GGML_NATIVE) list(APPEND ARCH_FLAGS -march=native) else () - list(APPEND 
ARCH_FLAGS -msse4.2) - list(APPEND ARCH_DEFINITIONS GGML_SSE42) + if (GGML_SSE42) + list(APPEND ARCH_FLAGS -msse4.2) + list(APPEND ARCH_DEFINITIONS GGML_SSE42) + endif() if (GGML_F16C) list(APPEND ARCH_FLAGS -mf16c) list(APPEND ARCH_DEFINITIONS GGML_F16C) diff --git a/ggml/src/ggml-cpu/cpu-feats-x86.cpp b/ggml/src/ggml-cpu/cpu-feats-x86.cpp index 902ee4346660c..d775a0363858d 100644 --- a/ggml/src/ggml-cpu/cpu-feats-x86.cpp +++ b/ggml/src/ggml-cpu/cpu-feats-x86.cpp @@ -263,7 +263,7 @@ void test_x86_is() { static int ggml_backend_cpu_x86_score() { // FIXME: this does not check for OS support - int score = 0; + int score = 1; cpuid_x86 is; #ifdef GGML_FMA From 243453533e029334181dda50d911d5fc5a2b2486 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 22 Apr 2025 10:37:00 +0200 Subject: [PATCH 008/200] llava : update documentations (#13055) * llava : update documentations * fix typo --- common/arg.cpp | 3 +- .../multimodal/MobileVLM.md | 26 +- .../multimodal/gemma3.md | 7 +- .../multimodal/glmedge.md | 6 +- .../multimodal/granitevision.md | 8 +- docs/multimodal/llava.md | 143 ++++++++ .../multimodal/minicpmo2.6.md | 8 +- .../multimodal/minicpmv2.5.md | 8 +- .../multimodal/minicpmv2.6.md | 8 +- examples/llava/README.md | 177 ++-------- examples/llava/android/adb_run.sh | 2 +- .../llava/gemma3_convert_encoder_to_gguf.py | 307 ------------------ 12 files changed, 212 insertions(+), 491 deletions(-) rename examples/llava/MobileVLM-README.md => docs/multimodal/MobileVLM.md (96%) rename examples/llava/README-gemma3.md => docs/multimodal/gemma3.md (82%) rename examples/llava/README-glmedge.md => docs/multimodal/glmedge.md (80%) rename examples/llava/README-granitevision.md => docs/multimodal/granitevision.md (92%) create mode 100644 docs/multimodal/llava.md rename examples/llava/README-minicpmo2.6.md => docs/multimodal/minicpmo2.6.md (73%) rename examples/llava/README-minicpmv2.5.md => docs/multimodal/minicpmv2.5.md (72%) rename examples/llava/README-minicpmv2.6.md => docs/multimodal/minicpmv2.6.md (71%) delete mode 100644 examples/llava/gemma3_convert_encoder_to_gguf.py diff --git a/common/arg.cpp b/common/arg.cpp index 80c318a0e50d0..1cfd0168d95ae 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -976,14 +976,13 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-gritlm", "llama-imatrix", "llama-infill", - "llama-llava-cli", + "llama-mtmd-cli", "llama-llava-clip-quantize-cli", "llama-lookahead", "llama-lookup", "llama-lookup-create", "llama-lookup-merge", "llama-lookup-stats", - "llama-minicpmv-cli", "llama-parallel", "llama-passkey", "llama-perplexity", diff --git a/examples/llava/MobileVLM-README.md b/docs/multimodal/MobileVLM.md similarity index 96% rename from examples/llava/MobileVLM-README.md rename to docs/multimodal/MobileVLM.md index 4f783f3ce05fb..20ac02f7a8dfc 100644 --- a/examples/llava/MobileVLM-README.md +++ b/docs/multimodal/MobileVLM.md @@ -9,15 +9,15 @@ The implementation is based on llava, and is compatible with llava and mobileVLM Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using **MobileVLM-1.7B** as an example, the different conversion step will be shown. ## Usage -Build with cmake or run `make llama-llava-cli` to build it. -After building, run: `./llama-llava-cli` to see the usage. For example: +Build the `llama-mtmd-cli` binary. + +After building, run: `./llama-mtmd-cli` to see the usage. 
For example: ```sh -./llama-llava-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ +./llama-mtmd-cli -m MobileVLM-1.7B/ggml-model-q4_k.gguf \ --mmproj MobileVLM-1.7B/mmproj-model-f16.gguf \ - --image path/to/an/image.jpg \ - -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWho is the author of this book? Answer the question using a single word or phrase. ASSISTANT:" + --chat-template deepseek ``` ## Model conversion @@ -82,7 +82,7 @@ refer to `android/adb_run.sh`, modify resources' `name` and `path` ### case 1 **input** ```sh -/data/local/tmp/llama-llava-cli \ +/data/local/tmp/llama-mtmd-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -102,7 +102,7 @@ llama_print_timings: total time = 34731.93 ms ### case 2 **input** ```sh -/data/local/tmp/llama-llava-cli \ +/data/local/tmp/llama-mtmd-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -123,10 +123,10 @@ llama_print_timings: total time = 34570.79 ms ## Some result on Android with `Snapdragon 778G` chip ### MobileVLM-1.7B case -#### llava-cli release-b2005 +#### mtmd-cli release-b2005 **input** ```sh -/data/local/tmp/llama-llava-cli \ +/data/local/tmp/llama-mtmd-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -t 4 \ @@ -147,7 +147,7 @@ llama_print_timings: prompt eval time = 8119.49 ms / 191 tokens ( 42.51 m llama_print_timings: eval time = 1005.75 ms / 14 runs ( 71.84 ms per token, 13.92 tokens per second) llama_print_timings: total time = 28038.34 ms / 205 tokens ``` -#### llava-cli latest-version +#### mtmd-cli latest-version **input** Just the same as above. @@ -169,7 +169,7 @@ llama_print_timings: eval time = 43894.02 ms / 13 runs ( 3376.46 m llama_print_timings: total time = 865441.76 ms / 204 tokens ``` ### MobileVLM_V2-1.7B case -#### llava-cli release-2005b +#### mtmd-cli release-2005b **input** Just the same as above. @@ -200,7 +200,7 @@ make GGML_CUDA=1 CUDA_DOCKER_ARCH=sm_87 GGML_CUDA_F16=1 -j 32 ### case 1 **input** ```sh -./llama-llava-cli \ +./llama-mtmd-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ --image /data/local/tmp/demo.jpeg \ @@ -224,7 +224,7 @@ llama_print_timings: total time = 1352.63 ms / 252 tokens ### case 2 **input** ```sh -./llama-llava-cli \ +./llama-mtmd-cli \ -m /data/local/tmp/ggml-model-q4_k.gguf \ --mmproj /data/local/tmp/mmproj-model-f16.gguf \ -p "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? ASSISTANT:" \ diff --git a/examples/llava/README-gemma3.md b/docs/multimodal/gemma3.md similarity index 82% rename from examples/llava/README-gemma3.md rename to docs/multimodal/gemma3.md index 3c25ee2583027..8fa077de71985 100644 --- a/examples/llava/README-gemma3.md +++ b/docs/multimodal/gemma3.md @@ -26,11 +26,12 @@ llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF ## How to get mmproj.gguf? +Simply to add `--mmproj` in when converting model via `convert_hf_to_gguf.py`: + ```bash cd gemma-3-4b-it -python ../llama.cpp/examples/llava/gemma3_convert_encoder_to_gguf.py . - -# output file is mmproj.gguf +python ../llama.cpp/convert_hf_to_gguf.py --outfile model.gguf --outtype f16 --mmproj . +# output file: mmproj-model.gguf ``` ## How to run it? 
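
As a rough sketch of the step the hunk above stops at (file names are assumptions based on the conversion example just shown, i.e. running `convert_hf_to_gguf.py` inside `gemma-3-4b-it/`), the resulting model/projector pair can be loaded with the unified CLI the same way the other docs in this series do:

```bash
# rough sketch; paths assume the conversion step above was run inside gemma-3-4b-it/
./build/bin/llama-mtmd-cli \
    -m gemma-3-4b-it/model.gguf \
    --mmproj gemma-3-4b-it/mmproj-model.gguf \
    --image xx.jpg \
    -p "What is in the image?"
```

Leaving out `--image` and `-p` starts the interactive conversation mode used in the MiniCPM docs later in this series.
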
diff --git a/examples/llava/README-glmedge.md b/docs/multimodal/glmedge.md similarity index 80% rename from examples/llava/README-glmedge.md rename to docs/multimodal/glmedge.md index 603d01474513f..af6b696a8ad27 100644 --- a/examples/llava/README-glmedge.md +++ b/docs/multimodal/glmedge.md @@ -3,12 +3,12 @@ Currently this implementation supports [glm-edge-v-2b](https://huggingface.co/THUDM/glm-edge-v-2b) and [glm-edge-v-5b](https://huggingface.co/THUDM/glm-edge-v-5b). ## Usage -Build with cmake or run `make llama-llava-cli` to build it. +Build the `llama-mtmd-cli` binary. -After building, run: `./llama-llava-cli` to see the usage. For example: +After building, run: `./llama-mtmd-cli` to see the usage. For example: ```sh -./llama-llava-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf --image img_path/image.jpg -p "<|system|>\n system prompt <|user|>\n prompt <|assistant|>\n" +./llama-mtmd-cli -m model_path/ggml-model-f16.gguf --mmproj model_path/mmproj-model-f16.gguf ``` **note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. diff --git a/examples/llava/README-granitevision.md b/docs/multimodal/granitevision.md similarity index 92% rename from examples/llava/README-granitevision.md rename to docs/multimodal/granitevision.md index f08a21cc175b4..3118fe0cdc113 100644 --- a/examples/llava/README-granitevision.md +++ b/docs/multimodal/granitevision.md @@ -176,15 +176,11 @@ Note that currently you cannot quantize the visual encoder because granite visio ### 5. Running the Model in Llama cpp -Build llama cpp normally; you should have a target binary named `llama-llava-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner. +Build llama cpp normally; you should have a target binary named `llama-mtmd-cli`, which you can pass two binaries to. As an example, we pass the the llama.cpp banner. ```bash -$ ./build/bin/llama-llava-cli -m $LLM_GGUF_PATH \ +$ ./build/bin/llama-mtmd-cli -m $LLM_GGUF_PATH \ --mmproj $VISUAL_GGUF_PATH \ - --image ./media/llama0-banner.png \ -c 16384 \ - -p "<|system|>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n<|user|>\n\\nWhat does the text in this image say?\n<|assistant|>\n" \ --temp 0 ``` - -Sample output: `The text in the image reads "LLAMA C++ Can it run DOOM Llama?"` diff --git a/docs/multimodal/llava.md b/docs/multimodal/llava.md new file mode 100644 index 0000000000000..c5bdc82158ede --- /dev/null +++ b/docs/multimodal/llava.md @@ -0,0 +1,143 @@ +# LLaVA + +Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants, +as well as llava-1.6 [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants. + +The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) +and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) +models are available. +For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf) + +After API is confirmed, more models will be supported / uploaded. + +## Usage +Build the `llama-mtmd-cli` binary. + +After building, run: `./llama-mtmd-cli` to see the usage. 
For example: + +```sh +./llama-mtmd-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf \ + --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf \ + --chat-template vicuna +``` + +**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. +**note**: For GPU offloading ensure to use the `-ngl` flag just like usual + +## LLaVA 1.5 + +1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: + +```sh +git clone https://huggingface.co/liuhaotian/llava-v1.5-7b + +git clone https://huggingface.co/openai/clip-vit-large-patch14-336 +``` + +2. Install the required Python packages: + +```sh +pip install -r examples/llava/requirements.txt +``` + +3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: + +```sh +python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b +``` + +4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF: + +```sh +python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b +``` + +5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: + +```sh +python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown +``` + +Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. + +## LLaVA 1.6 gguf conversion +1) First clone a LLaVA 1.6 model: +```console +git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b +``` + +2) Install the required Python packages: + +```sh +pip install -r examples/llava/requirements.txt +``` + +3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: +```console +python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/ +``` +- you will find a llava.projector and a llava.clip file in your model directory + +4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: +```console +mkdir vit +cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin +cp ../llava-v1.6-vicuna-7b/llava.projector vit/ +curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json +``` + +5) Create the visual gguf model: +```console +python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision +``` +- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP + +6) Then convert the model to gguf format: +```console +python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown +``` + +7) And finally we can run the llava cli using the 1.6 model version: +```console +./llama-mtmd-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf +``` + +**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) + +**note** llava-1.6 greatly benefits from batched prompt processing (defaults work) + +**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way handle the LLM model conversion is to load the model in transformers, and export only the LLM from the llava next model. 
+ +```python +import os +import transformers + +model_path = ... +llm_export_path = ... + +tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) +model = transformers.AutoModelForImageTextToText.from_pretrained(model_path) + +tokenizer.save_pretrained(llm_export_path) +model.language_model.save_pretrained(llm_export_path) +``` + +Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures. + +## Chat template + +For llava-1.5 and llava-1.6, you need to use `vicuna` chat template. Simply add `--chat-template vicuna` to activate this template. + + +## How to know if you are running in llava-1.5 or llava-1.6 mode + +When running llava-cli you will see a visual information right before the prompt is being processed: + +**Llava-1.5:** +`encode_image_with_clip: image embedding created: 576 tokens` + +**Llava-1.6 (anything above 576):** +`encode_image_with_clip: image embedding created: 2880 tokens` + + +Alternatively just pay notice to how many "tokens" have been used for your prompt, it will also show 1000+ tokens for llava-1.6 diff --git a/examples/llava/README-minicpmo2.6.md b/docs/multimodal/minicpmo2.6.md similarity index 73% rename from examples/llava/README-minicpmo2.6.md rename to docs/multimodal/minicpmo2.6.md index 48c423238395b..de470d8a82cc6 100644 --- a/examples/llava/README-minicpmo2.6.md +++ b/docs/multimodal/minicpmo2.6.md @@ -40,9 +40,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model Inference on Linux or Mac ```bash -# run f16 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in single-turn mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" -# run quantized int4 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in conversation mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-o-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-2_6/mmproj-model-f16.gguf ``` diff --git a/examples/llava/README-minicpmv2.5.md b/docs/multimodal/minicpmv2.5.md similarity index 72% rename from examples/llava/README-minicpmv2.5.md rename to docs/multimodal/minicpmv2.5.md index 6bfe7abd16487..7a6879d3959ca 100644 --- a/examples/llava/README-minicpmv2.5.md +++ b/docs/multimodal/minicpmv2.5.md @@ -39,9 +39,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model Inference on Linux or Mac ```bash -# run f16 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in single-turn mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/model-8B-F16.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" 
-# run quantized int4 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in conversation mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-Llama3-V-2_5/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-Llama3-V-2_5/mmproj-model-f16.gguf ``` diff --git a/examples/llava/README-minicpmv2.6.md b/docs/multimodal/minicpmv2.6.md similarity index 71% rename from examples/llava/README-minicpmv2.6.md rename to docs/multimodal/minicpmv2.6.md index 2df39cdbac78a..410a5dd1771e4 100644 --- a/examples/llava/README-minicpmv2.6.md +++ b/docs/multimodal/minicpmv2.6.md @@ -39,9 +39,9 @@ python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model Inference on Linux or Mac ```bash -# run f16 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in single-turn mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" -# run quantized int4 version -./build/bin/llama-minicpmv-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?" +# run in conversation mode +./build/bin/llama-mtmd-cli -m ../MiniCPM-V-2_6/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-2_6/mmproj-model-f16.gguf ``` diff --git a/examples/llava/README.md b/examples/llava/README.md index 0e3c32032055b..cadbc53fab0d7 100644 --- a/examples/llava/README.md +++ b/examples/llava/README.md @@ -1,158 +1,47 @@ -# LLaVA +# Multimodal Support in llama.cpp -Currently this implementation supports [llava-v1.5](https://huggingface.co/liuhaotian/llava-v1.5-7b) variants, -as well as llava-1.6 [llava-v1.6](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2) variants. +This directory provides multimodal capabilities for `llama.cpp`. Initially intended as a showcase for running LLaVA models, its scope has expanded significantly over time to include various other vision-capable models. As a result, LLaVA is no longer the only multimodal architecture supported. -The pre-converted [7b](https://huggingface.co/mys/ggml_llava-v1.5-7b) -and [13b](https://huggingface.co/mys/ggml_llava-v1.5-13b) -models are available. -For llava-1.6 a variety of prepared gguf models are available as well [7b-34b](https://huggingface.co/cmp-nct/llava-1.6-gguf) +> [!IMPORTANT] +> +> Multimodal support can be viewed as a sub-project within `llama.cpp`. It is under **very heavy development**, and **breaking changes are expected**. -After API is confirmed, more models will be supported / uploaded. +The naming and structure related to multimodal support have evolved, which might cause some confusion. Here's a brief timeline to clarify: -## Usage -Build with cmake or run `make llama-llava-cli` to build it. +- [#3436](https://github.com/ggml-org/llama.cpp/pull/3436): Initial support for LLaVA 1.5 was added, introducing `llava.cpp` and `clip.cpp`. The `llava-cli` binary was created for model interaction. 
+- [#4954](https://github.com/ggml-org/llama.cpp/pull/4954): Support for MobileVLM was added, becoming the second vision model supported. This built upon the existing `llava.cpp`, `clip.cpp`, and `llava-cli` infrastructure. +- **Expansion & Fragmentation:** Many new models were subsequently added (e.g., [#7599](https://github.com/ggml-org/llama.cpp/pull/7599), [#10361](https://github.com/ggml-org/llama.cpp/pull/10361), [#12344](https://github.com/ggml-org/llama.cpp/pull/12344), and others). However, `llava-cli` lacked support for the increasingly complex chat templates required by these models. This led to the creation of model-specific binaries like `qwen2vl-cli`, `minicpmv-cli`, and `gemma3-cli`. While functional, this proliferation of command-line tools became confusing for users. +- [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs. +- [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`. -After building, run: `./llama-llava-cli` to see the usage. For example: +## How it works and what is `mmproj`? -```sh -./llama-llava-cli -m ../llava-v1.5-7b/ggml-model-f16.gguf --mmproj ../llava-v1.5-7b/mmproj-model-f16.gguf --image path/to/an/image.jpg -``` +Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model. -**note**: A lower temperature like 0.1 is recommended for better quality. add `--temp 0.1` to the command to do so. -**note**: For GPU offloading ensure to use the `-ngl` flag just like usual +This approach keeps the multimodal components distinct from the core `libllama` library. Separating these allows for faster, independent development cycles. While many modern vision models are based on Vision Transformers (ViTs), their specific pre-processing and projection steps can vary significantly. Integrating this diverse complexity directly into `libllama` is currently challenging. -## LLaVA 1.5 +Consequently, running a multimodal model typically requires two GGUF files: +1. The standard language model file. +2. A corresponding **multimodal projector (`mmproj`)** file, which handles the image encoding and projection. -1. Clone a LLaVA and a CLIP model ([available options](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)). For example: +## What is `libmtmd`? -```sh -git clone https://huggingface.co/liuhaotian/llava-v1.5-7b +As outlined in the history, `libmtmd` is the modern library designed to replace the original `llava.cpp` implementation for handling multimodal inputs. -git clone https://huggingface.co/openai/clip-vit-large-patch14-336 -``` +Built upon `clip.cpp` (similar to `llava.cpp`), `libmtmd` offers several advantages: +- **Unified Interface:** Aims to consolidate interaction for various multimodal models. +- **Improved UX/DX:** Features a more intuitive API, inspired by the `Processor` class in the Hugging Face `transformers` library. +- **Flexibility:** Designed to support multiple input types (text, audio, images) while respecting the wide variety of chat templates used by different models. -2. 
Install the required Python packages: +## How to obtain `mmproj` -```sh -pip install -r examples/llava/requirements.txt -``` +Multimodal projector (`mmproj`) files are specific to each model architecture. Please refer to the relevant guide for instructions on how to obtain or create them: -3. Use `llava_surgery.py` to split the LLaVA model to LLaMA and multimodel projector constituents: - -```sh -python ./examples/llava/llava_surgery.py -m ../llava-v1.5-7b -``` - -4. Use `convert_image_encoder_to_gguf.py` to convert the LLaVA image encoder to GGUF: - -```sh -python ./examples/llava/convert_image_encoder_to_gguf.py -m ../clip-vit-large-patch14-336 --llava-projector ../llava-v1.5-7b/llava.projector --output-dir ../llava-v1.5-7b -``` - -5. Use `examples/convert_legacy_llama.py` to convert the LLaMA part of LLaVA to GGUF: - -```sh -python ./examples/convert_legacy_llama.py ../llava-v1.5-7b --skip-unknown -``` - -Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory. - -## LLaVA 1.6 gguf conversion -1) First clone a LLaVA 1.6 model: -```console -git clone https://huggingface.co/liuhaotian/llava-v1.6-vicuna-7b -``` - -2) Install the required Python packages: - -```sh -pip install -r examples/llava/requirements.txt -``` - -3) Use `llava_surgery_v2.py` which also supports llava-1.5 variants pytorch as well as safetensor models: -```console -python examples/llava/llava_surgery_v2.py -C -m ../llava-v1.6-vicuna-7b/ -``` -- you will find a llava.projector and a llava.clip file in your model directory - -4) Copy the llava.clip file into a subdirectory (like vit), rename it to pytorch_model.bin and add a fitting vit configuration to the directory: -```console -mkdir vit -cp ../llava-v1.6-vicuna-7b/llava.clip vit/pytorch_model.bin -cp ../llava-v1.6-vicuna-7b/llava.projector vit/ -curl -s -q https://huggingface.co/cmp-nct/llava-1.6-gguf/raw/main/config_vit.json -o vit/config.json -``` - -5) Create the visual gguf model: -```console -python ./examples/llava/convert_image_encoder_to_gguf.py -m vit --llava-projector vit/llava.projector --output-dir vit --clip-model-is-vision -``` -- This is similar to llava-1.5, the difference is that we tell the encoder that we are working with the pure vision model part of CLIP - -6) Then convert the model to gguf format: -```console -python ./examples/convert_legacy_llama.py ../llava-v1.6-vicuna-7b/ --skip-unknown -``` - -7) And finally we can run the llava cli using the 1.6 model version: -```console -./llama-llava-cli -m ../llava-v1.6-vicuna-7b/ggml-model-f16.gguf --mmproj vit/mmproj-model-f16.gguf --image some-image.jpg -c 4096 -``` - -**note** llava-1.6 needs more context than llava-1.5, at least 3000 is needed (just run it at -c 4096) - -**note** llava-1.6 greatly benefits from batched prompt processing (defaults work) - -**note** if the language model in step `6)` is incompatible with the legacy conversion script, the easiest way handle the LLM model conversion is to load the model in transformers, and export only the LLM from the llava next model. - -```python -import os -import transformers - -model_path = ... -llm_export_path = ... - -tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) -model = transformers.AutoModelForImageTextToText.from_pretrained(model_path) - -tokenizer.save_pretrained(llm_export_path) -model.language_model.save_pretrained(llm_export_path) -``` - -Then, you can convert the LLM using the `convert_hf_to_gguf.py` script, which handles more LLM architectures. 
- -## llava-cli templating and llava-1.6 prompting - -llava-1.5 models all use the same vicuna prompt, here you can just add your image question like `-p "Provide a full description."` -For llava-1.5 models which are not vicuna (mistral and Yi) you need to adapt system prompt as well as user prompt, for this purpose llava-cli has a basic templating system: - -**For Mistral and using llava-cli binary:** -Add this: `-p "\nUSER:\nProvide a full description.\nASSISTANT:\n"` -The mistral template for llava-1.6 seems to be no system print and a USER/ASSISTANT role - -**For the 34B this should work:** -Add this: `-e -p <|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nProvide a full description.<|im_end|><|im_start|>assistant\n` - - -## How to know if you are running in llava-1.5 or llava-1.6 mode - -When running llava-cli you will see a visual information right before the prompt is being processed: - -**Llava-1.5:** -`encode_image_with_clip: image embedding created: 576 tokens` - -**Llava-1.6 (anything above 576):** -`encode_image_with_clip: image embedding created: 2880 tokens` - - -Alternatively just pay notice to how many "tokens" have been used for your prompt, it will also show 1000+ tokens for llava-1.6 - - - - -## TODO - -- [x] Support non-CPU backend for the image encoding part. -- [ ] Support different sampling methods. -- [ ] Support more model variants. +- [LLaVA](../../docs/multimodal/llava.md) +- [MobileVLM](../../docs/multimodal/MobileVLM.md) +- [GLM-Edge](../../docs/multimodal/glmedge.md) +- [MiniCPM-V 2.5](../../docs/multimodal/minicpmv2.5.md) +- [MiniCPM-V 2.6](../../docs/multimodal/minicpmv2.6.md) +- [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md) +- [IBM Granite Vision](../../docs/multimodal/granitevision.md) +- [Google Gemma 3](../../docs/multimodal/gemma3.md) diff --git a/examples/llava/android/adb_run.sh b/examples/llava/android/adb_run.sh index 45ccf8d70d863..a24d6787d9a05 100755 --- a/examples/llava/android/adb_run.sh +++ b/examples/llava/android/adb_run.sh @@ -10,7 +10,7 @@ prompt="A chat between a curious user and an artificial intelligence assistant. # prompt="A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: \nWhat is in the image? 
ASSISTANT:" program_dir="build_64/bin" -binName="llama-llava-cli" +binName="llama-mtmd-cli" n_threads=4 diff --git a/examples/llava/gemma3_convert_encoder_to_gguf.py b/examples/llava/gemma3_convert_encoder_to_gguf.py deleted file mode 100644 index 241b526b9ede7..0000000000000 --- a/examples/llava/gemma3_convert_encoder_to_gguf.py +++ /dev/null @@ -1,307 +0,0 @@ -import gguf -import argparse -import logging -import sys -import torch -import json -import os -import numpy as np -from typing import cast, ContextManager, Any, Iterator -from pathlib import Path -from torch import Tensor - -logger = logging.getLogger("gemma3-mmproj") - - -# (copied from convert_hf_to_gguf.py) -# tree of lazy tensors -class LazyTorchTensor(gguf.LazyBase): - _tensor_type = torch.Tensor - # to keep the type-checker happy - dtype: torch.dtype - shape: torch.Size - - # only used when converting a torch.Tensor to a np.ndarray - _dtype_map: dict[torch.dtype, type] = { - torch.float16: np.float16, - torch.float32: np.float32, - } - - # used for safetensors slices - # ref: https://github.com/huggingface/safetensors/blob/079781fd0dc455ba0fe851e2b4507c33d0c0d407/bindings/python/src/lib.rs#L1046 - # TODO: uncomment U64, U32, and U16, ref: https://github.com/pytorch/pytorch/issues/58734 - _dtype_str_map: dict[str, torch.dtype] = { - "F64": torch.float64, - "F32": torch.float32, - "BF16": torch.bfloat16, - "F16": torch.float16, - # "U64": torch.uint64, - "I64": torch.int64, - # "U32": torch.uint32, - "I32": torch.int32, - # "U16": torch.uint16, - "I16": torch.int16, - "U8": torch.uint8, - "I8": torch.int8, - "BOOL": torch.bool, - "F8_E4M3": torch.float8_e4m3fn, - "F8_E5M2": torch.float8_e5m2, - } - - def numpy(self) -> gguf.LazyNumpyTensor: - dtype = self._dtype_map[self.dtype] - return gguf.LazyNumpyTensor( - meta=gguf.LazyNumpyTensor.meta_with_dtype_and_shape(dtype, self.shape), - args=(self,), - func=(lambda s: s.numpy()) - ) - - @classmethod - def meta_with_dtype_and_shape(cls, dtype: torch.dtype, shape: tuple[int, ...]) -> Tensor: - return torch.empty(size=shape, dtype=dtype, device="meta") - - @classmethod - def from_safetensors_slice(cls, st_slice: Any) -> Tensor: - dtype = cls._dtype_str_map[st_slice.get_dtype()] - shape: tuple[int, ...] 
= tuple(st_slice.get_shape()) - lazy = cls(meta=cls.meta_with_dtype_and_shape(dtype, shape), args=(st_slice,), func=lambda s: s[:]) - return cast(torch.Tensor, lazy) - - @classmethod - def __torch_function__(cls, func, types, args=(), kwargs=None): - del types # unused - - if kwargs is None: - kwargs = {} - - if func is torch.Tensor.numpy: - return args[0].numpy() - - return cls._wrap_fn(func)(*args, **kwargs) - - -class Gemma3VisionTower: - hparams: dict - gguf_writer: gguf.GGUFWriter - fname_out: Path - ftype: gguf.LlamaFileType - - @staticmethod - def load_hparams(dir_model: Path): - with open(dir_model / "config.json", "r", encoding="utf-8") as f: - return json.load(f) - - @staticmethod - def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]: - part_names: list[str] = [] - for filename in os.listdir(dir_model): - if filename.startswith(prefix) and filename.endswith(suffix): - part_names.append(filename) - part_names.sort() - return part_names - - def __init__(self, - dir_model: Path, - fname_out: Path, - ftype: gguf.LlamaFileType, - is_big_endian: bool,): - hparams = Gemma3VisionTower.load_hparams(dir_model) - self.hparams = hparams - self.fname_out = fname_out - self.ftype = ftype - endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE - self.gguf_writer = gguf.GGUFWriter(path=None, arch="clip", endianess=endianess) - - text_config = hparams["text_config"] - vision_config = hparams["vision_config"] - - assert hparams["architectures"][0] == "Gemma3ForConditionalGeneration" - assert text_config is not None - assert vision_config is not None - - self.gguf_writer.add_string ("clip.projector_type", "gemma3") - self.gguf_writer.add_bool ("clip.has_text_encoder", False) - self.gguf_writer.add_bool ("clip.has_vision_encoder", True) - self.gguf_writer.add_bool ("clip.has_llava_projector", False) # legacy - self.gguf_writer.add_uint32 ("clip.vision.image_size", vision_config["image_size"]) - self.gguf_writer.add_uint32 ("clip.vision.patch_size", vision_config["patch_size"]) - self.gguf_writer.add_uint32 ("clip.vision.embedding_length", vision_config["hidden_size"]) - self.gguf_writer.add_uint32 ("clip.vision.feed_forward_length", vision_config["intermediate_size"]) - self.gguf_writer.add_uint32 ("clip.vision.projection_dim", text_config["hidden_size"]) - self.gguf_writer.add_uint32 ("clip.vision.block_count", vision_config["num_hidden_layers"]) - self.gguf_writer.add_uint32 ("clip.vision.attention.head_count", vision_config["num_attention_heads"]) - self.gguf_writer.add_float32("clip.vision.attention.layer_norm_epsilon", vision_config.get("layer_norm_eps", 1e-6)) - # default values taken from HF tranformers code - self.gguf_writer.add_array ("clip.vision.image_mean", [0.5, 0.5, 0.5]) - self.gguf_writer.add_array ("clip.vision.image_std", [0.5, 0.5, 0.5]) - self.gguf_writer.add_bool ("clip.use_gelu", True) - - # load tensors - for name, data_torch in self.get_tensors(dir_model): - # convert any unsupported data types to float32 - if data_torch.dtype not in (torch.float16, torch.float32): - data_torch = data_torch.to(torch.float32) - self.add_tensor(name, data_torch) - - def get_tensors(self, dir_model: Path) -> Iterator[tuple[str, Tensor]]: - part_names = Gemma3VisionTower.get_model_part_names(dir_model, "model", ".safetensors") - tensor_names_from_parts: set[str] = set() - for part_name in part_names: - logger.info(f"gguf: loading model part '{part_name}'") - from safetensors import safe_open - ctx = cast(ContextManager[Any], safe_open(dir_model 
/ part_name, framework="pt", device="cpu")) - with ctx as model_part: - tensor_names_from_parts.update(model_part.keys()) - - for name in model_part.keys(): - data = model_part.get_slice(name) - data = LazyTorchTensor.from_safetensors_slice(data) - yield name, data - - def add_tensor(self, name: str, data_torch: Tensor): - is_1d = len(data_torch.shape) == 1 - is_embd = ".embeddings." in name - old_dtype = data_torch.dtype - can_quantize = not is_1d and not is_embd - data_qtype = gguf.GGMLQuantizationType.F32 - - # this is to support old checkpoint - # TODO: remove this when we have the final model - name = name.replace("vision_model.vision_model.", "vision_tower.vision_model.") - name = name.replace("multimodal_projector.", "multi_modal_projector.") - - # filter only vision tensors - if not name.startswith("vision_tower.vision_model.") and not name.startswith("multi_modal_projector."): - return - # prefix - name = name.replace("vision_tower.vision_model.encoder.layers.", "v.blk.") - name = name.replace("vision_tower.vision_model.", "v.") - # projector and input embd - name = name.replace(".embeddings.patch_embedding.", ".patch_embd.") - name = name.replace(".embeddings.position_embedding.", ".position_embd.") - name = name.replace( - "multi_modal_projector.mm_input_projection_weight", - "mm.input_projection.weight" - ) - name = name.replace( - "multi_modal_projector.mm_soft_emb_norm.weight", - "mm.soft_emb_norm.weight" - ) - name = name.replace("post_layernorm.", "post_ln.") - # each block - name = name.replace(".self_attn.k_proj.", ".attn_k.") - name = name.replace(".self_attn.v_proj.", ".attn_v.") - name = name.replace(".self_attn.q_proj.", ".attn_q.") - name = name.replace(".self_attn.out_proj.", ".attn_out.") - name = name.replace(".layer_norm1.", ".ln1.") - name = name.replace(".layer_norm2.", ".ln2.") - name = name.replace(".mlp.fc1.", ".ffn_down.") - name = name.replace(".mlp.fc2.", ".ffn_up.") - - if can_quantize: - if self.ftype == gguf.LlamaFileType.ALL_F32: - data_qtype = gguf.GGMLQuantizationType.F32 - elif self.ftype == gguf.LlamaFileType.MOSTLY_F16: - data_qtype = gguf.GGMLQuantizationType.F16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16: - data_qtype = gguf.GGMLQuantizationType.BF16 - elif self.ftype == gguf.LlamaFileType.MOSTLY_Q8_0: - data_qtype = gguf.GGMLQuantizationType.Q8_0 - else: - raise ValueError(f"Unsupported file type: {self.ftype}") - - # corrent norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector - # the other norm values are part of SigLIP model, and they are already correct - # ref code: Gemma3RMSNorm - if "soft_emb_norm.weight" in name: - logger.info(f"Correcting norm value for '{name}'") - data_torch = data_torch + 1 - - data = data_torch.numpy() - - try: - data = gguf.quants.quantize(data, data_qtype) - except Exception as e: - logger.error(f"Error quantizing tensor '{name}': {e}, fallback to F16") - data_qtype = gguf.GGMLQuantizationType.F16 - data = gguf.quants.quantize(data, data_qtype) - - # reverse shape to make it similar to the internal ggml dimension order - shape_str = f"{{{', '.join(str(n) for n in reversed(data_torch.shape))}}}" - logger.info(f"{f'%-32s' % f'{name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}") - - self.gguf_writer.add_tensor(name, data, raw_dtype=data_qtype) - - def write(self): - self.gguf_writer.write_header_to_file(path=self.fname_out) - self.gguf_writer.write_kv_data_to_file() - self.gguf_writer.write_tensors_to_file(progress=True) - self.gguf_writer.close() - -def 
parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="Convert Gemma 3 vision tower safetensors to GGUF format",) - parser.add_argument( - "--outfile", type=Path, default="mmproj.gguf", - help="path to write to", - ) - parser.add_argument( - "--outtype", type=str, choices=["f32", "f16", "bf16", "q8_0"], default="f16", - help="output format", - ) - parser.add_argument( - "--bigendian", action="store_true", - help="model is executed on big endian machine", - ) - parser.add_argument( - "model", type=Path, - help="directory containing model file", - nargs="?", - ) - parser.add_argument( - "--verbose", action="store_true", - help="increase output verbosity", - ) - - args = parser.parse_args() - if args.model is None: - parser.error("the following arguments are required: model") - return args - - -def main() -> None: - args = parse_args() - - if args.verbose: - logging.basicConfig(level=logging.DEBUG) - else: - logging.basicConfig(level=logging.INFO) - - dir_model = args.model - - if not dir_model.is_dir(): - logger.error(f'Error: {args.model} is not a directory') - sys.exit(1) - - ftype_map: dict[str, gguf.LlamaFileType] = { - "f32": gguf.LlamaFileType.ALL_F32, - "f16": gguf.LlamaFileType.MOSTLY_F16, - "bf16": gguf.LlamaFileType.MOSTLY_BF16, - "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0, - } - - logger.info(f"Loading model: {dir_model.name}") - - with torch.inference_mode(): - gemma3_vision_tower = Gemma3VisionTower( - dir_model=dir_model, - fname_out=args.outfile, - ftype=ftype_map[args.outtype], - is_big_endian=args.bigendian, - ) - gemma3_vision_tower.write() - - -if __name__ == '__main__': - main() - From 7b53389c24a507564be39e1ea82746a39749059b Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Apr 2025 16:15:51 +0300 Subject: [PATCH 009/200] metal : add memory pool for temp allocs (#12850) * metal : add memory pool for temp allocs (wip) [no ci] * cont : free buffers from the heap * cont : resize heap [no ci] * cont : refactor heap [no ci] * cont : heap for each cmd buffer [no ci] * cont : fix free * wip * cont : fix alignment [no ci] * cont : not working .. 
[no ci] * cont : heap allocation now works [no ci] * cont : use MTLHeapTypePlacement ggml-ci * metal : use dynamic MTLHeap allocations ggml-ci * metal : add comments * metal : disable softmax use of mem_pool ggml-ci * metal : final touches --- ggml/src/ggml-metal/ggml-metal.m | 406 ++++++++++++++++++++++++++++--- 1 file changed, 366 insertions(+), 40 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 266d8af4693c2..d92392edb7eb1 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -44,8 +44,8 @@ // note: assumes single GPU device - the default one // TODO: support multiple GPU devices static struct ggml_backend_metal_device_context { - id mtl_device; - int mtl_device_ref_count; + id mtl_device; + int mtl_device_ref_count; id mtl_library; bool has_simdgroup_reduction; @@ -490,7 +490,259 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte GGML_METAL_KERNEL_TYPE_COUNT }; +// +// ggml_metal_heap +// + +struct ggml_metal_heap { + // number of times the heap was unused + int n_unused; + + // total number of buffer allocations in this heap across all computes + int64_t n_alloc; + + // current offset in the heap - we reset this after each node in order to reuse the memory + size_t offs; + + // the currently allocated MTLBuffer objects in this heap + id obj; + + NSMutableArray * bufs; +}; + +static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { + struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap)); + + MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; + desc.storageMode = MTLStorageModePrivate; + desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; + desc.type = MTLHeapTypePlacement; + desc.size = size; + + heap->n_unused = 0; + heap->n_alloc = 0; + + heap->obj = [device newHeapWithDescriptor:desc]; + if (!heap->obj) { + GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); + + free(heap); + + return false; + } + + [desc release]; + + heap->bufs = [[NSMutableArray alloc] init]; + + return heap; +} + +static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { + heap->offs = 0; + + // count how many graph computes the heap ended up being unused + if ([heap->bufs count] > 0) { + heap->n_unused = 0; + } else { + heap->n_unused++; + } + + for (id buf in heap->bufs) { + [buf release]; + } + [heap->bufs removeAllObjects]; + + // tell the OS that it can reuse this memory if needed + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; +} + +static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { + if (heap == nil) { + return; + } + + ggml_metal_heap_reset(heap); + + [heap->obj release]; + [heap->bufs release]; + + free(heap); +} + +@interface ggml_metal_heap_ptr : NSObject + +@property (nonatomic, assign) struct ggml_metal_heap * data; + +@end + +@implementation ggml_metal_heap_ptr +@end + +// +// ggml_metal_mem_pool +// + +struct ggml_metal_mem_pool { + id device; + + int n_heaps; // total number of heaps ever created (including those that were removed) + + NSMutableArray * heaps; + NSMutableArray * heaps_to_remove; +}; + +static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { + struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); + + mem_pool->n_heaps = 0; + + mem_pool->heaps = [[NSMutableArray alloc] init]; + mem_pool->heaps_to_remove = 
[[NSMutableArray alloc] init]; + + return mem_pool; +} + +static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { + GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps); + + size_t size_all = 0; + size_t size_cur = 0; + + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data); + GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc); + GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused); + GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]); + + if ([ptr.data->bufs count] > 0) { + size_cur += [ptr.data->obj size]; + } + size_all += [ptr.data->obj size]; + + ggml_metal_heap_free(ptr.data); + [ptr release]; + } + [mem_pool->heaps release]; + [mem_pool->heaps_to_remove release]; + + if (size_all > 0) { + GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0); + GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0); + } + + free(mem_pool); +} + +static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { + for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) { + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_reset(heap); + + // if the heap hasn't been used for a while, remove it + if (heap->n_unused >= 128) { + [mem_pool->heaps_to_remove addObject:@(i)]; + } + } + + if (mem_pool->heaps_to_remove.count > 0) { + for (NSUInteger i = 0; i < [mem_pool->heaps_to_remove count]; i++) { + NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; + ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; + + struct ggml_metal_heap * heap = ptr.data; + ggml_metal_heap_free(heap); + + [mem_pool->heaps removeObjectAtIndex:index]; + [ptr release]; + } + + [mem_pool->heaps_to_remove removeAllObjects]; + } +} + +static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + ptr.data->offs = 0; + } +} + +static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) { + const size_t alignment = 32; + + const size_t size_aligned = GGML_PAD(size, alignment); + + // try one of the existing heaps + for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { + struct ggml_metal_heap * heap = ptr.data; + if (heap->offs + size_aligned <= [heap->obj size]) { + // if this is the first buffer in the heap for the current command buffer, tell the OS that + // it cannot free the memory used by the heap + // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc + if ([heap->bufs count] == 0) { + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + } + + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return nil; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + return buf; + } + } + + // create a new heap that can fit this buffer + ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; + + struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); + if (heap == NULL) { 
+ GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned); + return NULL; + } + + //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); + + heap_ptr.data = heap; + ggml_metal_heap_reset(heap); + + [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; + id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; + if (buf == nil) { + GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); + return NULL; + } + + heap->n_alloc++; + heap->offs += size_aligned; + + [heap->bufs addObject:buf]; + + [mem_pool->heaps addObject:heap_ptr]; + mem_pool->n_heaps++; + + return buf; +} + +struct ggml_metal_command_buffer { + id obj; + + // each command buffer has a memory pool from which it can allocate temporary buffers during the compute + struct ggml_metal_mem_pool * mem_pool; +}; + struct ggml_backend_metal_context { + id device; id queue; dispatch_queue_t d_queue; @@ -515,7 +767,7 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte void (^encode_async)(size_t ith); // n_cb command buffers + 1 used by the main thread - id command_buffers[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; + struct ggml_metal_command_buffer cmd_bufs[GGML_METAL_MAX_COMMAND_BUFFERS + 1]; // abort ggml_metal_graph_compute if callback returns true ggml_abort_callback abort_callback; @@ -705,9 +957,11 @@ @implementation GGMLMetalClass struct ggml_backend_metal_device_context * ctx_dev = dev->context; id device = ggml_backend_metal_device_acq(ctx_dev); + GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]); - ctx->queue = [device newCommandQueue]; + ctx->device = device; + ctx->queue = [device newCommandQueue]; if (ctx->queue == nil) { GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__); return NULL; @@ -768,7 +1022,10 @@ @implementation GGMLMetalClass ctx->gf = nil; ctx->encode_async = nil; for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { - ctx->command_buffers[i] = nil; + ctx->cmd_bufs[i].obj = nil; + + ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); + ctx->cmd_bufs[i].mem_pool->device = device; } #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15) @@ -1181,6 +1438,12 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->queue release]; + for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { + // ctx->cmd_bufs[i].obj is auto released + + ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); + } + dispatch_release(ctx->d_queue); free(ctx); @@ -1486,10 +1749,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex } } -static void ggml_metal_encode_node( +static bool ggml_metal_encode_node( ggml_backend_t backend, int idx, - id encoder) { + id encoder, + struct ggml_metal_mem_pool * mem_pool) { struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -1505,7 +1769,7 @@ static void ggml_metal_encode_node( struct ggml_tensor * dst = node; if (ggml_is_empty(dst)) { - return; + return true; } switch (dst->op) { @@ -1516,7 +1780,7 @@ static void ggml_metal_encode_node( case GGML_OP_PERMUTE: { // noop -> next node - } return; + } return true; default: { } break; @@ -1527,6 +1791,8 @@ static void ggml_metal_encode_node( GGML_ABORT("unsupported op"); } + ggml_metal_mem_pool_clear(mem_pool); + const int64_t ne00 
= src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? src0->ne[2] : 0; @@ -2173,26 +2439,76 @@ static void ggml_metal_encode_node( const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - ggml_metal_kargs_soft_max args = { +// use this branch to test the ggml_metal_mem_pool functionality +#if 0 + // cpy to tmp buffer in MTLHeap + + id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); + if (!h_src0) { + GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); + return false; + } + + offs_src0 = 0; + + ggml_metal_kargs_cpy args_cpy = { /*.ne00 =*/ ne00, /*.ne01 =*/ ne01, /*.ne02 =*/ ne02, - /*.scale =*/ scale, - /*.max_bias =*/ max_bias, - /*.m0 =*/ m0, - /*.m1 =*/ m1, + /*.ne03 =*/ ne03, + /*.nb00 =*/ nb00, + /*.nb01 =*/ nb01, + /*.nb02 =*/ nb02, + /*.nb03 =*/ nb03, + /*.ne0 =*/ ne00, + /*.ne1 =*/ ne01, + /*.ne2 =*/ ne02, + /*.ne3 =*/ ne03, + /*.nb0 =*/ nb00, + /*.nb1 =*/ nb01, + /*.nb2 =*/ nb02, + /*.nb3 =*/ nb03, + }; + + if (src0->type == GGML_TYPE_F16) { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; + } else { + [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline]; + } + [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:0 atIndex:2]; + + GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); + int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type)); + + [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; + +#else + id h_src0 = id_src0; +#endif + // softmax + + ggml_metal_kargs_soft_max args = { + /*.ne00 =*/ ne00, + /*.ne01 =*/ ne01, + /*.ne02 =*/ ne02, + /*.scale =*/ scale, + /*.max_bias =*/ max_bias, + /*.m0 =*/ m0, + /*.m1 =*/ m1, /*.n_head_log2 =*/ n_head_log2, }; [encoder setComputePipelineState:pipeline]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:0]; if (id_src1) { - [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; } else { - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; + [encoder setBuffer:h_src0 offset:offs_src0 atIndex:1]; } - [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; - [encoder setBytes:&args length:sizeof(args) atIndex:3]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&args length:sizeof(args) atIndex:3]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; @@ -4601,6 +4917,8 @@ static void ggml_metal_encode_node( GGML_ABORT("fatal error"); } } + + return true; } static enum ggml_status ggml_metal_graph_compute( @@ -4654,25 +4972,25 @@ static enum ggml_status ggml_metal_graph_compute( } // the main thread commits the first few commands immediately - // command_buffer[n_cb] + // cmd_buf[n_cb] { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[n_cb] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[n_cb].obj = cmd_buf; - [command_buffer enqueue]; + [cmd_buf enqueue]; ctx->encode_async(n_cb); } // prepare the rest of the command buffers asynchronously - // command_buffer[0.. n_cb) + // cmd_buf[0.. 
n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - id command_buffer = [ctx->queue commandBufferWithUnretainedReferences]; - ctx->command_buffers[cb_idx] = command_buffer; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; + ctx->cmd_bufs[cb_idx].obj = cmd_buf; // always enqueue the first two command buffers // enqueue all of the command buffers if we don't need to abort if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer enqueue]; + [cmd_buf enqueue]; } } @@ -4681,14 +4999,14 @@ static enum ggml_status ggml_metal_graph_compute( // wait for completion and check status of each command buffer // needed to detect if the device ran out-of-memory for example (#1881) { - id command_buffer = ctx->command_buffers[n_cb]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[n_cb].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, n_cb, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; @@ -4696,20 +5014,20 @@ static enum ggml_status ggml_metal_graph_compute( } for (int i = 0; i < n_cb; ++i) { - id command_buffer = ctx->command_buffers[i]; - [command_buffer waitUntilCompleted]; + id cmd_buf = ctx->cmd_bufs[i].obj; + [cmd_buf waitUntilCompleted]; - MTLCommandBufferStatus status = [command_buffer status]; + MTLCommandBufferStatus status = [cmd_buf status]; if (status != MTLCommandBufferStatusCompleted) { GGML_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status); if (status == MTLCommandBufferStatusError) { - GGML_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]); + GGML_LOG_INFO("error: %s\n", [[cmd_buf error].localizedDescription UTF8String]); } return GGML_STATUS_FAILED; } - id next_buffer = (i + 1 < n_cb ? ctx->command_buffers[i + 1] : nil); + id next_buffer = (i + 1 < n_cb ? 
ctx->cmd_bufs[i + 1].obj : nil); if (!next_buffer) { continue; } @@ -5092,8 +5410,9 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; - id command_buffer = ctx->command_buffers[cb_idx]; - id encoder = [command_buffer computeCommandEncoder]; + id cmd_buf = ctx->cmd_bufs[cb_idx].obj; + + id encoder = [cmd_buf computeCommandEncoder]; int node_start = 0; int node_end = n_nodes_0; @@ -5105,22 +5424,29 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const bool should_capture = ctx->capture_next_compute; + struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; + ggml_metal_mem_pool_reset(mem_pool); + for (int idx = node_start; idx < node_end; ++idx) { if (should_capture) { [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(ggml_graph_node(ctx->gf, idx)) encoding:NSUTF8StringEncoding]]; } - ggml_metal_encode_node(backend, idx, encoder); + const bool res = ggml_metal_encode_node(backend, idx, encoder, mem_pool); if (should_capture) { [encoder popDebugGroup]; } + + if (!res) { + break; + } } [encoder endEncoding]; if (cb_idx < 2 || ctx->abort_callback == NULL) { - [command_buffer commit]; + [cmd_buf commit]; } }); } From ab47dec3d37aa1927c2ec590e166b76141374ed3 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 22 Apr 2025 16:16:10 +0300 Subject: [PATCH 010/200] security : add note about RPC and server functionality (#13061) * security : add note about RPC functionality * security : add note about llama-server --- SECURITY.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/SECURITY.md b/SECURITY.md index 6a1bb6c32cd8e..9370fb1a88321 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -40,7 +40,8 @@ To protect sensitive data from potential leaks or unauthorized access, it is cru ### Untrusted environments or networks If you can't run your models in a secure and isolated environment or if it must be exposed to an untrusted network, make sure to take the following security precautions: -* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value +* Do not use the RPC backend, [rpc-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/rpc) and [llama-server](https://github.com/ggml-org/llama.cpp/tree/master/examples/server) functionality (see https://github.com/ggml-org/llama.cpp/pull/13061). +* Confirm the hash of any downloaded artifact (e.g. pre-trained model weights) matches a known-good value. * Encrypt your data if sending it over the network. 
### Multi-Tenant environments From dc39a5e7a84815a90fa0c515ed8927870cf858c9 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 22 Apr 2025 16:24:54 +0200 Subject: [PATCH 011/200] mtmd : support SmolVLM (version 1 and 2) (#13050) * mtmd : support SmolVLM (version 1 and 2) * correct chat template * fix n_patches * scale_factor is an int * add more models to test --- convert_hf_to_gguf.py | 100 +++++++++++++++++++----- examples/llava/clip-impl.h | 7 +- examples/llava/clip.cpp | 135 +++++++++++++++++++++++---------- examples/llava/mtmd.cpp | 7 ++ examples/llava/tests.sh | 11 ++- gguf-py/gguf/constants.py | 9 +++ gguf-py/gguf/gguf_writer.py | 47 ++++++++++++ gguf-py/gguf/tensor_mapping.py | 4 +- src/llama-chat.cpp | 23 +++++- src/llama-chat.h | 1 + 10 files changed, 279 insertions(+), 65 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 6d34541a3cecc..645bdad9b57d2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -419,8 +419,12 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str] def load_hparams(dir_model: Path): with open(dir_model / "config.json", "r", encoding="utf-8") as f: hparams = json.load(f) + architectures = hparams.get("architectures") if "text_config" in hparams: hparams = {**hparams, **hparams["text_config"]} + if architectures is not None: + # preserve "architectures" from root level config + hparams["architectures"] = architectures return hparams @classmethod @@ -1061,6 +1065,8 @@ def _set_vocab_builtin(self, model_name: Literal["gpt-neox", "llama-spm"], vocab class VisionModel(ModelBase): model_arch = gguf.MODEL_ARCH.CLIP_VISION n_text_embd = 0 + preprocessor_config: dict[str, Any] + global_config: dict[str, Any] def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1075,24 +1081,33 @@ def __init__(self, *args, **kwargs): if "vision_config" not in self.hparams: raise ValueError("vision_config not found in hparams") - # move vision config to the top level + # move vision config to the top level, while preserving the original hparams in global_config + self.global_config = self.hparams self.hparams = self.hparams["vision_config"] + # load preprocessor config + with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f: + self.preprocessor_config = json.load(f) + def set_type(self): self.gguf_writer.add_type(gguf.GGUFType.CLIP_VISION) def set_gguf_parameters(self): self.gguf_writer.add_file_type(self.ftype) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PROJECTION_DIM, self.n_embd_text) - self.gguf_writer.add_bool(gguf.Keys.ClipVision.HAS_VISION_ENCODER, True) + self.gguf_writer.add_vision_projection_dim(self.n_embd_text) + self.gguf_writer.add_vision_has_vision_encoder(True) # vision config - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.IMAGE_SIZE, self.find_hparam(["image_size"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.PATCH_SIZE, self.find_hparam(["patch_size"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.EMBEDDING_LENGTH, self.find_hparam(["hidden_size"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.FEED_FORWARD_LENGTH, self.find_hparam(["intermediate_size"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.BLOCK_COUNT, self.find_hparam(["num_hidden_layers"])) - self.gguf_writer.add_uint32(gguf.Keys.ClipVision.Attention.HEAD_COUNT, self.find_hparam(["num_attention_heads"])) + self.gguf_writer.add_vision_image_size(self.find_hparam(["image_size"])) + 
self.gguf_writer.add_vision_patch_size(self.find_hparam(["patch_size"])) + self.gguf_writer.add_vision_embedding_length(self.find_hparam(["hidden_size"])) + self.gguf_writer.add_vision_feed_forward_length(self.find_hparam(["intermediate_size"])) + self.gguf_writer.add_vision_block_count(self.find_hparam(["num_hidden_layers"])) + self.gguf_writer.add_vision_head_count(self.find_hparam(["num_attention_heads"])) + + # preprocessor config + self.gguf_writer.add_vision_image_mean(self.preprocessor_config["image_mean"]) + self.gguf_writer.add_vision_image_std(self.preprocessor_config["image_std"]) def write_vocab(self): raise ValueError("VisionModel does not support vocab writing") @@ -1703,11 +1718,23 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed norms: {norms}") -@ModelBase.register("LLaMAForCausalLM", "LlamaForCausalLM", "MistralForCausalLM", "MixtralForCausalLM") +@ModelBase.register( + "LLaMAForCausalLM", + "LlamaForCausalLM", + "MistralForCausalLM", + "MixtralForCausalLM", + "Idefics3ForConditionalGeneration", + "SmolVLMForConditionalGeneration") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing `num_attention_heads` in config.json + if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + def set_vocab(self): try: self._set_vocab_sentencepiece() @@ -1770,6 +1797,12 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") + is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + + if is_vision_tensor: + return [] # skip vision tensors + elif name.startswith("model.text_model"): + name = name.replace("text_model.", "") # for SmolVLM if self.undo_permute: if name.endswith(("q_proj.weight", "q_proj.bias")): @@ -1852,6 +1885,41 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") +class SmolVLMModel(VisionModel): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # fix for SmolVLM2, missing some keys in config.json + # default values are taken from transformers code + if self.hparams["model_type"] == "smolvlm_vision": + self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1152) + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) + self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 3072) + self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 12) + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.IDEFICS3) + self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-5)) + self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2)) + self.gguf_writer.add_vision_use_gelu(True) + + def tensor_force_quant(self, name, new_name, bid, n_dims): + del bid, new_name, n_dims # unused + if ".embeddings." 
in name: + return gguf.GGMLQuantizationType.F32 + return False + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + + if is_vision_tensor: + return [(self.map_tensor_name(name), data_torch)] + + return [] # skip other tensors + + @ModelBase.register("Llama4ForConditionalGeneration") class Llama4Model(LlamaModel): model_arch = gguf.MODEL_ARCH.LLAMA4 @@ -3591,12 +3659,10 @@ class Gemma3VisionModel(VisionModel): def set_gguf_parameters(self): super().set_gguf_parameters() hparams = self.hparams - self.gguf_writer.add_string(gguf.Keys.ClipVision.PROJECTOR_TYPE, "gemma3") + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.GEMMA3) # default values below are taken from HF tranformers code - self.gguf_writer.add_float32(gguf.Keys.ClipVision.Attention.LAYERNORM_EPS, hparams.get("layer_norm_eps", 1e-6)) - self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_MEAN, [0.5, 0.5, 0.5]) - self.gguf_writer.add_array(gguf.Keys.ClipVision.IMAGE_STD, [0.5, 0.5, 0.5]) - self.gguf_writer.add_bool (gguf.Keys.ClipVision.USE_GELU, True) + self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("layer_norm_eps", 1e-6)) + self.gguf_writer.add_vision_use_gelu(True) def tensor_force_quant(self, name, new_name, bid, n_dims): del bid, new_name, n_dims # unused @@ -3614,10 +3680,6 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter or name.startswith("multimodal_projector.") or name.startswith("vision_model."): # process vision tensors name = name.replace("_weight", ".weight") - if "fc1" in name: - name = name.replace("fc1", "fc2") - else: - name = name.replace("fc2", "fc1") # correct norm value ; only this "soft_emb_norm" need to be corrected as it's part of Gemma projector # the other norm values are part of SigLIP model, and they are already correct diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 180ae9880b124..faa8a9d5e9dbb 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -33,13 +33,13 @@ #define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" #define KEY_PROJ_DIM "clip.%s.projection_dim" #define KEY_TOKENS "tokenizer.ggml.tokens" -#define KEY_N_POSITIONS "clip.text.context_length" #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" #define KEY_IMAGE_STD "clip.vision.image_std" -#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_FEATURE_LAYER "clip.vision.feature_layer" +#define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" +#define KEY_PROJ_TYPE "clip.projector_type" #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" @@ -72,6 +72,7 @@ #define TN_IMAGE_NEWLINE "model.image_newline" #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 +#define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 // mimicpmv #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" @@ -99,6 +100,7 @@ enum projector_type { PROJECTOR_TYPE_GLM_EDGE, PROJECTOR_TYPE_MERGER, PROJECTOR_TYPE_GEMMA3, + PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_UNKNOWN, }; @@ -110,6 +112,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, { 
PROJECTOR_TYPE_GEMMA3, "gemma3"}, + { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 759706156d515..ad39e1bd6194c 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -159,6 +159,7 @@ struct clip_hparams { int32_t projection_dim; int32_t n_head; int32_t n_layer; + int32_t proj_scale_factor = 0; // idefics3 patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; @@ -506,6 +507,35 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im embeddings = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, model.mm_input_proj_w)), embeddings); + + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/0a950e0bbe1ed58d5401a6b547af19f15f0c195e/src/transformers/models/idefics3/modeling_idefics3.py#L578 + + ggml_tensor * cur = embeddings; + const int scale_factor = model.hparams.proj_scale_factor; + const int n_embd = cur->ne[0]; + const int seq = cur->ne[1]; + const int bsz = 1; // batch size, always 1 for now since we don't support batching + const int height = std::sqrt(seq); + const int width = std::sqrt(seq); + GGML_ASSERT(scale_factor != 0); + cur = ggml_reshape_4d(ctx0, cur, n_embd * scale_factor, width / scale_factor, height, bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + height / scale_factor, + width / scale_factor, + bsz); + cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); + cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur), + n_embd * scale_factor * scale_factor, + seq / (scale_factor * scale_factor), + bsz); + + cur = ggml_mul_mat(ctx0, model.projection, cur); + embeddings = cur; + } else { + GGML_ABORT("SigLIP: Unsupported projector type"); } // build the graph @@ -1081,12 +1111,20 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return clip_image_build_graph_siglip(ctx, imgs); - } else { - // TODO: we should have one build_* function per model - return clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + ggml_cgraph * res; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: + { + res = clip_image_build_graph_siglip(ctx, imgs); + } break; + default: + { + // TODO: we should have one build_* function per model + res = clip_image_build_graph_legacy(ctx, imgs, load_image_size, is_inf); + } break; } + return res; } struct clip_model_loader { @@ -1147,6 +1185,8 @@ struct clip_model_loader { } void load_hparams() { + auto & hparams = ctx_clip.vision_model.hparams; + // projector type { std::string proj_type; @@ -1177,7 +1217,6 @@ struct clip_model_loader { get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); - auto & hparams = ctx_clip.vision_model.hparams; get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size); get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head); get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate); @@ -1233,6 +1272,16 @@ struct clip_model_loader { LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", 
__func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } + + // model-specific params + switch (ctx_clip.proj_type) { + case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; + default: + break; + } } void load_tensors() { @@ -1422,6 +1471,10 @@ struct clip_model_loader { vision_model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ); vision_model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N); } break; + case PROJECTOR_TYPE_IDEFICS3: + { + vision_model.projection = get_tensor(TN_MM_PROJECTOR); + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -2195,10 +2248,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str return true; } - if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { + if (ctx->has_glm_projector + || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 + || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { clip_image_u8 resized_image; int sz = params.image_size; - image_manipulation::bicubic_resize(*img, resized_image, sz, sz); + image_manipulation::resize_and_pad_image(*img, resized_image, {sz, sz}); clip_image_f32_ptr img_f32(clip_image_f32_init()); //clip_image_save_to_bmp(resized_image, "resized.bmp"); normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); @@ -2330,6 +2385,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i n_patches = x_patch * y_patch; } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { n_patches = 256; + } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + n_patches /= ctx->vision_model.hparams.proj_scale_factor; } return n_patches; @@ -2597,6 +2654,9 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { // do nothing } + else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { + // do nothing + } else { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); @@ -2783,37 +2843,34 @@ bool clip_model_quantize(const char * fname_inp, const char * fname_out, const i } int clip_n_mmproj_embd(const struct clip_ctx * ctx) { - if (ctx->proj_type == PROJECTOR_TYPE_LDP) { - return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) { - return ctx->vision_model.mm_model_peg_0_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP) { - return ctx->vision_model.mm_2_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) { - return ctx->vision_model.mm_3_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - if (ctx->minicpmv_version == 2) { - return 4096; - } - else if (ctx->minicpmv_version == 3) { - return 3584; - } - else if (ctx->minicpmv_version == 4) { - return 3584; - } - } - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE){ - return ctx->vision_model.mm_model_mlp_3_w->ne[1]; - } - if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { - return ctx->vision_model.mm_1_b->ne[0]; - } - if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - return ctx->vision_model.mm_input_proj_w->ne[0]; + switch (ctx->proj_type) { + case PROJECTOR_TYPE_LDP: + return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0]; + case PROJECTOR_TYPE_LDPV2: + return ctx->vision_model.mm_model_peg_0_b->ne[0]; + case PROJECTOR_TYPE_MLP: + return ctx->vision_model.mm_2_b->ne[0]; + case PROJECTOR_TYPE_MLP_NORM: + return ctx->vision_model.mm_3_b->ne[0]; + case PROJECTOR_TYPE_RESAMPLER: + if (ctx->minicpmv_version == 2) { + return 4096; + } else if 
(ctx->minicpmv_version == 3) { + return 3584; + } else if (ctx->minicpmv_version == 4) { + return 3584; + } + break; // Should not happen if version is valid + case PROJECTOR_TYPE_GLM_EDGE: + return ctx->vision_model.mm_model_mlp_3_w->ne[1]; + case PROJECTOR_TYPE_MERGER: + return ctx->vision_model.mm_1_b->ne[0]; + case PROJECTOR_TYPE_GEMMA3: + return ctx->vision_model.mm_input_proj_w->ne[0]; + case PROJECTOR_TYPE_IDEFICS3: + return ctx->vision_model.projection->ne[1]; + default: + break; // Fall through to throw } std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 8866c12e4ddef..c3fb2f18a2b02 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -176,6 +176,8 @@ int32_t mtmd_tokenize(mtmd_context * ctx, std::string prompt_modified(text.text); std::string marker_modified(ctx->image_marker); + projector_type proj_type = clip_get_projector_type(ctx->ctx_clip); + // a bit hacky here, but works for now // for some models, we need to add prefix and suffix to the image embeddings if (clip_is_gemma3(ctx->ctx_clip)) { @@ -183,6 +185,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx, // <start_of_image> ... (image embeddings) ... <end_of_image> marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>"; string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { + // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215 + marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>"; + string_replace_all(prompt_modified, ctx->image_marker, marker_modified); } // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh index 61ebb3ac18ead..8752fc267a8ac 100755 --- a/examples/llava/tests.sh +++ b/examples/llava/tests.sh @@ -28,6 +28,9 @@ add_test() { arr_tmpl+=("$tmpl") } +add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0" +add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0" add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M" add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek" add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M" @@ -39,7 +42,13 @@ add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" -# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" # this model has broken chat template, not usable +# these models always give the wrong answer, not sure why +# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M" +# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0" +# add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0" + +# this model has broken chat template, not usable +# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K" ############### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 3f24705201d93..59510bd0c2a2a 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -231,11 +231,15 @@ class ClipVision: IMAGE_MEAN = "clip.vision.image_mean" IMAGE_STD = "clip.vision.image_std" USE_GELU = "clip.use_gelu" + USE_SILU = "clip.use_silu" class Attention: HEAD_COUNT = "clip.vision.attention.head_count" LAYERNORM_EPS = 
"clip.vision.attention.layer_norm_epsilon" + class Projector: + SCALE_FACTOR = "clip.vision.projector.scale_factor" + # # recommended mapping of model tensor names for storage in gguf # @@ -2122,6 +2126,11 @@ def get_type(val: Any) -> GGUFValueType: raise ValueError(f"Unknown type: {type(val)}") +class VisionProjectorType: + GEMMA3 = "gemma3" + IDEFICS3 = "idefics3" + + # Items here are (block size, type size) QK_K = 256 GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = { diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index aef03db1577a7..48e9a470b78d6 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -931,6 +931,53 @@ def add_eot_token_id(self, id: int) -> None: def add_eom_token_id(self, id: int) -> None: self.add_uint32(Keys.Tokenizer.EOM_ID, id) + # for vision models + + def add_vision_projection_dim(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value) + + def add_vision_has_vision_encoder(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value) + + def add_vision_patch_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.PATCH_SIZE, value) + + def add_vision_embedding_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value) + + def add_vision_feed_forward_length(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value) + + def add_vision_block_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.BLOCK_COUNT, value) + + def add_vision_head_count(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value) + + def add_vision_projector_type(self, value: str) -> None: + self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value) + + def add_vision_attention_layernorm_eps(self, value: float) -> None: + self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) + + def add_vision_image_size(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) + + def add_vision_image_mean(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_MEAN, values) + + def add_vision_image_std(self, values: Sequence[float]) -> None: + self.add_array(Keys.ClipVision.IMAGE_STD, values) + + def add_vision_use_gelu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_GELU, value) + + def add_vision_use_silu(self, value: bool) -> None: + self.add_bool(Keys.ClipVision.USE_SILU, value) + + def add_vision_projector_scale_factor(self, value: int) -> None: + self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value) + def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes: pack_prefix = '' if not skip_pack_prefix: diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 22066b2868019..3ff378c136645 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -961,13 +961,13 @@ class TensorNameMap: MODEL_TENSOR.V_ENC_FFN_UP: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", "vpm.encoder.layers.{bid}.mlp.fc1", - "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM + "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped) ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", "vpm.encoder.layers.{bid}.mlp.fc2", - "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM + "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name 
is swapped) ), MODEL_TENSOR.V_PRE_NORM: ( diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index f62850ca574b0..41f89e3a9d3bd 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -62,6 +62,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = { { "yandex", LLM_CHAT_TEMPLATE_YANDEX }, { "bailing", LLM_CHAT_TEMPLATE_BAILING }, { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 }, + { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM }, }; llm_chat_template llm_chat_template_from_str(const std::string & name) { @@ -81,7 +82,9 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { if (tmpl_contains("<|im_start|>")) { return tmpl_contains("<|im_sep|>") ? LLM_CHAT_TEMPLATE_PHI_4 - : LLM_CHAT_TEMPLATE_CHATML; + : tmpl_contains("<end_of_utterance>") + ? LLM_CHAT_TEMPLATE_SMOLVLM // SmolVLM uses <|im_start|> as BOS, but it is NOT chatml + : LLM_CHAT_TEMPLATE_CHATML; } else if (tmpl.find("mistral") == 0 || tmpl_contains("[INST]")) { if (tmpl_contains("[SYSTEM_PROMPT]")) { return LLM_CHAT_TEMPLATE_MISTRAL_V7; @@ -622,7 +625,23 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|header_start|>assistant<|header_end|>\n\n"; } - } else { + } else if (tmpl == LLM_CHAT_TEMPLATE_SMOLVLM) { + // SmolVLM + ss << "<|im_start|>"; // uses <|im_start|> as BOS, but the actual content is NOT chatml + for (auto message : chat) { + std::string role(message->role); + if (role == "system") { + ss << message->content << "\n\n"; + } else if (role == "user") { + ss << "User: " << message->content << "\n"; + } else { + ss << "Assistant: " << message->content << "\n"; + } + } + if (add_ass) { + ss << "Assistant:"; + } + } else { // template not supported return -1; } diff --git a/src/llama-chat.h b/src/llama-chat.h index 34537ca21e46e..dc30df711a96e 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -41,6 +41,7 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_YANDEX, LLM_CHAT_TEMPLATE_BAILING, LLM_CHAT_TEMPLATE_LLAMA4, + LLM_CHAT_TEMPLATE_SMOLVLM, LLM_CHAT_TEMPLATE_UNKNOWN, }; From 658987cfc9d752dca7758987390d5fb1a7a0a54a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Tue, 22 Apr 2025 21:27:40 +0200 Subject: [PATCH 012/200] CUDA: noncont MMVQ + batched bs1 MUL_MAT_ID (#13014) * CUDA: noncont MMVQ + batched bs1 MUL_MAT_ID * fix logic for RoPE support, CUDA graphs --- ggml/src/ggml-cuda/ggml-cuda.cu | 165 +++++----- ggml/src/ggml-cuda/mmv.cu | 181 ++++++----- ggml/src/ggml-cuda/mmv.cuh | 2 +- ggml/src/ggml-cuda/mmvq.cu | 527 ++++++++++++++++++-------------- ggml/src/ggml-cuda/mmvq.cuh | 3 + ggml/src/ggml-cuda/quantize.cu | 66 ++-- ggml/src/ggml-cuda/quantize.cuh | 12 +- ggml/src/ggml-cuda/vecdotq.cuh | 2 + tests/test-backend-ops.cpp | 2 +- 9 files changed, 541 insertions(+), 419 deletions(-) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index a7febef723c2e..e0e0d2137f3be 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1410,6 +1410,11 @@ static void ggml_cuda_op_mul_mat( const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; + // const int64_t nb10 = src1->nb[0]; + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; + const int64_t nb13 = src1->nb[3]; + const int64_t nb2 = dst->nb[2]; const int64_t nb3 = dst->nb[3]; @@ -1545,7 +1550,10 @@ static void ggml_cuda_op_mul_mat( dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), src_1_ddq_size); if (src1_on_device && src1_is_contiguous) { - quantize_src1(dev[id].src1_ddf, dev[id].src1_ddq, ne10, ne11, ne12*ne13, src1_padded_col_size, src0->type, stream); + 
quantize_src1( + dev[id].src1_ddf, dev[id].src1_ddq, src0->type, ne10, + nb11/sizeof(float), nb12/sizeof(float), nb13/sizeof(float), + src1_padded_col_size, ne11, ne12, ne13, stream); CUDA_CHECK(cudaGetLastError()); } } @@ -1640,7 +1648,9 @@ static void ggml_cuda_op_mul_mat( } if (quantize_src1 && !src1_is_contiguous) { - quantize_src1(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, 1, src1_padded_col_size, src0->type, stream); + quantize_src1( + src1_ddf_i, src1_ddq_i, src0->type, ne10, ne10, ne11*ne10, ne12*ne11*ne10, + src1_padded_col_size, src1_ncols, 1, 1, stream); CUDA_CHECK(cudaGetLastError()); } @@ -1878,7 +1888,7 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft); - bool use_mul_mat_vec = (src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) + bool use_mul_mat_vec = (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || src0->type == GGML_TYPE_BF16) && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && src0->ne[0] % 2 == 0 && src1->ne[1] == 1; bool use_mul_mat_vec_q = ggml_is_quantized(src0->type) @@ -1919,10 +1929,12 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && use_mul_mat_vec && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { + if (!split && use_mul_mat_vec && (src0->ne[1] <= MMV_MAX_ROWS || any_gpus_without_fp16_mma)) { // the custom F16 vector kernel can be used over batched cuBLAS GEMM // but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention) - ggml_cuda_mul_mat_vec(ctx, src0, src1, dst); + ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); + } else if (!split && use_mul_mat_vec_q) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention @@ -1999,6 +2011,15 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * GGML_TENSOR_BINARY_OP_LOCALS + if (src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && ne2 == 1) { + if (ggml_is_quantized(src0->type)) { + ggml_cuda_mul_mat_vec_q(ctx, src0, src1, ids, dst); + } else { + ggml_cuda_mul_mat_vec(ctx, src0, src1, ids, dst); + } + return; + } + GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers"); cudaStream_t stream = ctx.stream(); @@ -2035,97 +2056,75 @@ static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, ggml_tensor * dst_row.nb[2] = nb1; dst_row.nb[3] = nb1; - if (ne12 == 1) { - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t id = 0; id < n_ids; id++) { - const int32_t i02 = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); - - GGML_ASSERT(i02 >= 0 && i02 < n_as); - - const int64_t i11 = id % ne11; - const int64_t i12 = 
iid1; - - const int64_t i1 = id; - const int64_t i2 = i12; - - src0_row.data = src0_original + i02*nb02; - src1_row.data = src1_original + i11*nb11 + i12*nb12; - dst_row.data = dst_original + i1*nb1 + i2*nb2; - - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); - } - } - } else { - ggml_cuda_pool_alloc src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); - ggml_cuda_pool_alloc dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst)); - - src1_row.data = src1_contiguous.get(); - dst_row.data = dst_contiguous.get(); + ggml_cuda_pool_alloc src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); + ggml_cuda_pool_alloc dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst)); - for (int64_t i02 = 0; i02 < n_as; i02++) { - int64_t num_src1_rows = 0; + src1_row.data = src1_contiguous.get(); + dst_row.data = dst_contiguous.get(); - for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { - for (int64_t id = 0; id < n_ids; id++) { - const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); + for (int64_t i02 = 0; i02 < n_as; i02++) { + int64_t num_src1_rows = 0; - GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); + for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) { + for (int64_t id = 0; id < n_ids; id++) { + const int32_t row_id_i = *(const int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]); - if (row_id_i != i02) { - continue; - } + GGML_ASSERT(row_id_i >= 0 && row_id_i < n_as); - num_src1_rows++; + if (row_id_i != i02) { + continue; } - } - if (num_src1_rows == 0) { - continue; + num_src1_rows++; } + } - ggml_cuda_pool_alloc dev_cur_src1_row(ctx.pool(), 1); - ggml_cuda_pool_alloc dev_row_mapping(ctx.pool(), num_src1_rows); - CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream)); + if (num_src1_rows == 0) { + continue; + } - { - dim3 block_dims(std::min((unsigned int)ne10, 768u)); - dim3 grid_dims(ids->ne[1], n_ids); - k_copy_src1_to_contiguous<<>>( - src1_original, src1_contiguous.get(), - dev_cur_src1_row.get(), dev_row_mapping.get(), - ids_dev, i02, ids->nb[1], ids->nb[0], - ne11, ne10, - nb11, nb12); - CUDA_CHECK(cudaGetLastError()); - } + ggml_cuda_pool_alloc dev_cur_src1_row(ctx.pool(), 1); + ggml_cuda_pool_alloc dev_row_mapping(ctx.pool(), num_src1_rows); + CUDA_CHECK(cudaMemsetAsync(dev_cur_src1_row.get(), 0, sizeof(int), stream)); + + { + dim3 block_dims(std::min((unsigned int)ne10, 768u)); + dim3 grid_dims(ids->ne[1], n_ids); + k_copy_src1_to_contiguous<<>>( + src1_original, src1_contiguous.get(), + dev_cur_src1_row.get(), dev_row_mapping.get(), + ids_dev, i02, ids->nb[1], ids->nb[0], + ne11, ne10, + nb11, nb12); + CUDA_CHECK(cudaGetLastError()); + } - src0_row.data = src0_original + i02*nb02; + src0_row.data = src0_original + i02*nb02; - GGML_ASSERT(nb11 == sizeof(float)*ne10); - GGML_ASSERT(nb1 == sizeof(float)*ne0); + GGML_ASSERT(nb11 == sizeof(float)*ne10); + GGML_ASSERT(nb1 == sizeof(float)*ne0); - src1_row.ne[1] = num_src1_rows; - src1_row.nb[1] = nb11; - src1_row.nb[2] = num_src1_rows*nb11; - src1_row.nb[3] = num_src1_rows*nb11; + src1_row.ne[1] = num_src1_rows; + src1_row.nb[1] = nb11; + src1_row.nb[2] = num_src1_rows*nb11; + src1_row.nb[3] = num_src1_rows*nb11; - dst_row.ne[1] = num_src1_rows; - dst_row.nb[1] = nb1; - dst_row.nb[2] = num_src1_rows*nb1; - dst_row.nb[3] = num_src1_rows*nb1; + dst_row.ne[1] = num_src1_rows; + dst_row.nb[1] = nb1; + dst_row.nb[2] = num_src1_rows*nb1; + dst_row.nb[3] = num_src1_rows*nb1; - ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); + 
ggml_cuda_mul_mat(ctx, &src0_row, &src1_row, &dst_row); - { - dim3 block_dims(std::min((unsigned int)ne0, 768u)); - dim3 grid_dims(num_src1_rows); - k_copy_dst_from_contiguous<<>>( - dst_original, dst_contiguous.get(), - dev_row_mapping.get(), - ne0, - nb1, nb2); - CUDA_CHECK(cudaGetLastError()); - } + { + dim3 block_dims(std::min((unsigned int)ne0, 768u)); + dim3 grid_dims(num_src1_rows); + k_copy_dst_from_contiguous<<>>( + dst_original, dst_contiguous.get(), + dev_row_mapping.get(), + ne0, + nb1, nb2); + CUDA_CHECK(cudaGetLastError()); } } } @@ -2489,7 +2488,7 @@ static bool check_node_graph_compatibility_and_refresh_copy_ops(ggml_backend_cud #endif } - if (node->op == GGML_OP_MUL_MAT_ID) { + if (node->op == GGML_OP_MUL_MAT_ID && node->ne[2] != 1) { use_cuda_graph = false; // This node type is not supported by CUDA graph capture #ifndef NDEBUG GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported node type\n", __func__); @@ -3203,9 +3202,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g } case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - const size_t ts = ggml_type_size(op->src[0]->type); - const int64_t ne0_012 = op->src[0]->ne[0] * op->src[0]->ne[1] * op->src[0]->ne[2]; - return op->src[0]->nb[0] == ts && op->src[0]->nb[3] == ne0_012*ts; + return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]); } case GGML_OP_IM2COL: case GGML_OP_POOL_2D: diff --git a/ggml/src/ggml-cuda/mmv.cu b/ggml/src/ggml-cuda/mmv.cu index b39961cd1154d..d8c385e2399ae 100644 --- a/ggml/src/ggml-cuda/mmv.cu +++ b/ggml/src/ggml-cuda/mmv.cu @@ -4,18 +4,23 @@ template static __global__ void mul_mat_vec( - const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row, + const T * __restrict__ x, const float * __restrict__ y, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int64_t ncols2, const int64_t nchannels_y, const int64_t stride_row, const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t sample_ratio, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst) { - const int64_t row = blockIdx.x; - const int64_t channel = blockIdx.y; - const int64_t sample = blockIdx.z; - const int tid = threadIdx.x; - constexpr int warp_size = ggml_cuda_get_physical_warp_size(); - - x += (sample/sample_ratio)*stride_sample_x + (channel/channel_ratio)*stride_channel_x + row*stride_row; - y += sample *stride_sample_y + channel *stride_channel_y; - dst += sample *stride_sample_dst + channel *stride_channel_dst; + const int64_t row = blockIdx.x; + const int64_t channel_dst = blockIdx.y; + const int64_t channel_x = ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int64_t channel_y = ids ? 
channel_dst % nchannels_y : channel_dst; + const int64_t sample_dst = blockIdx.z; + const int64_t sample_x = sample_dst / sample_ratio; + const int64_t sample_y = sample_dst; + const int tid = threadIdx.x; + constexpr int warp_size = ggml_cuda_get_physical_warp_size(); + + x += sample_x *stride_sample_x + channel_x *stride_channel_x + row*stride_row; + y += sample_y *stride_sample_y + channel_y *stride_channel_y; + dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst; const float2 * y2 = (const float2 *) y; @@ -31,12 +36,19 @@ static __global__ void mul_mat_vec( float sumf = 0.0f; - if constexpr (std::is_same::value) { + if constexpr (std::is_same::value) { + const float2 * x2 = (const float2 *) x; + + for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { + const float2 tmpx = x2[col2]; + const float2 tmpy = y2[col2]; + sumf += tmpx.x*tmpy.x; + sumf += tmpx.y*tmpy.y; + } + } else if constexpr (std::is_same::value) { const half2 * x2 = (const half2 *) x; if (std::is_same::value) { - sumf = 0.0f; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { const float2 tmpx = __half22float2(x2[col2]); const float2 tmpy = y2[col2]; @@ -59,8 +71,6 @@ static __global__ void mul_mat_vec( } } else if constexpr (std::is_same::value) { const int * x2 = (const int *) x; - sumf = 0.0f; - for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) { const int tmpx = x2[col2]; const float2 tmpy = y2[col2]; @@ -92,17 +102,17 @@ static __global__ void mul_mat_vec( template static void launch_mul_mat_vec_cuda( - const T * x, const float * y, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_y, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, cudaStream_t stream) { GGML_ASSERT(ncols % 2 == 0); GGML_ASSERT(stride_row % 2 == 0); - GGML_ASSERT(nchannels_y % nchannels_x == 0); - GGML_ASSERT(nsamples_y % nsamples_x == 0); - const int64_t channel_ratio = nchannels_y / nchannels_x; - const int64_t sample_ratio = nsamples_y / nsamples_x; + GGML_ASSERT(ids || nchannels_dst % nchannels_x == 0); + GGML_ASSERT( nsamples_dst % nsamples_x == 0); + const int64_t channel_ratio = nchannels_dst / nchannels_x; + const int64_t sample_ratio = nsamples_dst / nsamples_x; int device; int warp_size; @@ -124,48 +134,48 @@ static void launch_mul_mat_vec_cuda( } const int smem = warp_size*sizeof(float); - const dim3 block_nums(nrows, nchannels_y, nsamples_y); + const dim3 block_nums(nrows, nchannels_dst, nsamples_dst); const dim3 block_dims(block_size_best, 1, 1); switch (block_size_best) { case 32: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 
64: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 96: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 128: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 160: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 192: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 224: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; case 256: { mul_mat_vec<<>> - (x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, - sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); + (x, y, ids, dst, ncols/2, nchannels_y, stride_row, channel_ratio, stride_channel_x, stride_channel_y, + stride_channel_dst, sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); } break; default: { GGML_ABORT("fatal error"); @@ -175,28 +185,28 @@ static void launch_mul_mat_vec_cuda( template static void mul_mat_vec_cuda( - const T * x, const float * y, float * dst, - const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, + const T * x, const float * y, const int32_t * ids, float * dst, + const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y, const int64_t nchannels_dst, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst, const int64_t nsamples_x, - const int64_t nsamples_y, const int64_t stride_sample_x, const 
int64_t stride_sample_y, const int64_t stride_sample_dst, + const int64_t nsamples_dst, const int64_t stride_sample_x, const int64_t stride_sample_y, const int64_t stride_sample_dst, enum ggml_prec prec, cudaStream_t stream) { - switch (prec) { - case GGML_PREC_DEFAULT: { + if constexpr(std::is_same::value) { + if (prec == GGML_PREC_DEFAULT) { launch_mul_mat_vec_cuda - (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; - case GGML_PREC_F32: { - launch_mul_mat_vec_cuda - (x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, stream); - } break; + (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); + return; + } } + launch_mul_mat_vec_cuda + (x, y, ids, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, + stride_channel_dst, nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, stream); } -void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(dst->type == GGML_TYPE_F32); +void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_TENSOR_BINARY_OP_LOCALS; @@ -204,21 +214,24 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const size_t ts_src1 = ggml_type_size(src1->type); const size_t ts_dst = ggml_type_size(dst->type); - GGML_ASSERT(ne11 == 1); - GGML_ASSERT(ne12 == ne2); + GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. GGML_ASSERT(ne13 == ne3); - GGML_ASSERT(nb00 == ts_src0); - GGML_ASSERT(nb10 == ts_src1); - GGML_ASSERT(nb0 == ts_dst); + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); + GGML_ASSERT( nb0 == ts_dst); const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc; const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32; - const float * src1_d = (const float *) src1->data; - float * dst_d = (float *) dst->data; + const float * src1_d = (const float *) src1->data; + const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; + float * dst_d = (float *) dst->data; const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s1 = dst->nb[1] / ts_dst; const int64_t s02 = src0->nb[2] / ts_src0; const int64_t s12 = src1->nb[2] / ts_src1; const int64_t s2 = dst->nb[2] / ts_dst; @@ -226,14 +239,33 @@ void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * const int64_t s13 = src1->nb[3] / ts_src1; const int64_t s3 = dst->nb[3] / ts_dst; + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: + const int64_t ncols_dst = ids ? 
ne2 : ne1; + const int64_t nchannels_y = ids ? ne11 : ne12; + const int64_t nchannels_dst = ids ? ne1 : ne2; + const int64_t stride_channel_dst = ids ? s1 : s2; + const int64_t stride_channel_y = ids ? s11 : s12; + + GGML_ASSERT(ncols_dst == 1); + switch (src0->type) { + case GGML_TYPE_F32: { + const float * src0_d = (const float *) src0->data; + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); + } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream()); + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0->data; - mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, s01, ne02, ne12, s02, s12, s2, ne03, ne13, s03, s13, s3, prec, ctx.stream()); + mul_mat_vec_cuda(src0_d, src1_d, ids_d, dst_d, ne00, ne01, s01, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, prec, ctx.stream()); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); @@ -262,27 +294,34 @@ void ggml_cuda_op_mul_mat_vec( const int64_t stride_row = ne00; const int64_t nchannels_x = 1; const int64_t nchannels_y = 1; + const int64_t nchannels_dst = 1; const int64_t stride_channel_x = 0; const int64_t stride_channel_y = 0; const int64_t stride_channel_dst = 0; const int64_t nsamples_x = 1; - const int64_t nsamples_y = 1; + const int64_t nsamples_dst = 1; const int64_t stride_sample_x = 0; const int64_t stride_sample_y = 0; const int64_t stride_sample_dst = 0; switch (src0->type) { + case GGML_TYPE_F32: { + const float * src0_d = (const float *) src0_dd_i; + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + } break; case GGML_TYPE_F16: { const half * src0_d = (const half *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, - nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); } break; case GGML_TYPE_BF16: { const nv_bfloat16 * src0_d = (const nv_bfloat16 *) src0_dd_i; - mul_mat_vec_cuda(src0_d, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row, - nchannels_x, nchannels_y, stride_channel_x, stride_channel_y, stride_channel_dst, - nsamples_x, nsamples_y, stride_sample_x, stride_sample_y, stride_sample_dst, prec, stream); + mul_mat_vec_cuda(src0_d, src1_ddf_i, nullptr, dst_dd_i, ne00, row_diff, stride_row, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, 
stride_sample_dst, prec, stream); } break; default: GGML_ABORT("unsupported type: %s", ggml_type_name(src0->type)); diff --git a/ggml/src/ggml-cuda/mmv.cuh b/ggml/src/ggml-cuda/mmv.cuh index 78a1cd4a6906a..756e7e1cc7fc3 100644 --- a/ggml/src/ggml-cuda/mmv.cuh +++ b/ggml/src/ggml-cuda/mmv.cuh @@ -3,7 +3,7 @@ // maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available #define MMV_MAX_ROWS 512 -void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); void ggml_cuda_op_mul_mat_vec( ggml_backend_cuda_context & ctx, diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index eef8585a7380a..cac04916cd8f0 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -1,6 +1,9 @@ #include "mmvq.cuh" +#include "quantize.cuh" #include "vecdotq.cuh" +#include + typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs); static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) { @@ -73,9 +76,9 @@ static __host__ mmvq_parameter_table_id get_device_table_id(int cc) { return MMVQ_PARAMETERS_GENERIC; } -static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_parameter_table_id table_id) { +static constexpr __host__ __device__ int calc_nwarps(int ncols_dst, mmvq_parameter_table_id table_id) { if (table_id == MMVQ_PARAMETERS_GENERIC) { - switch (ncols_y) { + switch (ncols_dst) { case 1: case 2: case 3: @@ -90,7 +93,7 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete return 1; } } else if (table_id == MMVQ_PARAMETERS_GCN) { - switch (ncols_y) { + switch (ncols_dst) { case 1: case 2: case 3: @@ -107,9 +110,9 @@ static constexpr __host__ __device__ int calc_nwarps(int ncols_y, mmvq_paramete return 1; } -static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int table_id) { +static constexpr __host__ __device__ int calc_rows_per_block(int ncols_dst, int table_id) { if (table_id == MMVQ_PARAMETERS_GENERIC || table_id == MMVQ_PARAMETERS_GCN) { - switch (ncols_y) { + switch (ncols_dst) { case 1: return 1; case 2: @@ -127,19 +130,21 @@ static constexpr __host__ __device__ int calc_rows_per_block(int ncols_y, int ta return 1; } -template +template // tell the compiler to use as many registers as it wants, see nwarps definition below -__launch_bounds__(calc_nwarps(ncols_y, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) +__launch_bounds__(calc_nwarps(ncols_dst, get_device_table_id())*ggml_cuda_get_physical_warp_size(), 1) static __global__ void mul_mat_vec_q( - const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) { + const void * __restrict__ vx, const void * __restrict__ vy, const int32_t * __restrict__ ids, float * __restrict__ dst, + const int ncols_x, const int nchannels_y, const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) { constexpr int qk = ggml_cuda_type_traits::qk; 
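    // Index layout used by this kernel after the change: blockIdx.x selects the group of
    // dst rows (row0), blockIdx.y the dst channel and blockIdx.z the dst sample. For the
    // GGML_OP_MUL_MAT_ID path (ids != nullptr, supported only for ncols_dst == 1) the src0
    // channel is looked up indirectly as ids[channel_dst] and the src1 channel wraps around
    // nchannels_y; without ids, src0 channels and samples are broadcast via channel_ratio
    // and sample_ratio, matching the mul_mat_vec kernel in mmv.cu above.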
constexpr int qi = ggml_cuda_type_traits::qi; constexpr int vdr = get_vdr_mmvq(type); constexpr mmvq_parameter_table_id table_id = get_device_table_id(); - constexpr int nwarps = calc_nwarps(ncols_y, table_id); - constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_y, table_id); + constexpr int nwarps = calc_nwarps(ncols_dst, table_id); + constexpr int rows_per_cuda_block = calc_rows_per_block(ncols_dst, table_id); constexpr int warp_size = ggml_cuda_get_physical_warp_size(); constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type); @@ -147,13 +152,21 @@ static __global__ void mul_mat_vec_q( const int tid = warp_size*threadIdx.y + threadIdx.x; const int row0 = rows_per_cuda_block*blockIdx.x; const int blocks_per_row_x = ncols_x / qk; - const int blocks_per_col_y = nrows_y / QK8_1; constexpr int blocks_per_iter = vdr * nwarps*warp_size / qi; + // The MUL_MAT_ID code path with ids != nullptr is only implemetned for ncols_dst == 1. + const int channel_dst = blockIdx.y; + const int channel_x = ncols_dst == 1 && ids ? ids[channel_dst] : channel_dst / channel_ratio; + const int channel_y = ncols_dst == 1 && ids ? channel_dst % nchannels_y : channel_dst; + const int sample_dst = blockIdx.z; + const int sample_x = sample_dst / sample_ratio; + const int sample_y = sample_dst; + // partial sum for each thread - float tmp[ncols_y][rows_per_cuda_block] = {{0.0f}}; + float tmp[ncols_dst][rows_per_cuda_block] = {{0.0f}}; - const block_q8_1 * y = (const block_q8_1 *) vy; + const block_q8_1 * y = ((const block_q8_1 *) vy) + sample_y*stride_sample_y + channel_y*stride_channel_y; + const int kbx_offset = sample_x*stride_sample_x + channel_x*stride_channel_x + row0*stride_row_x; for (int kbx = tid / (qi/vdr); kbx < blocks_per_row_x; kbx += blocks_per_iter) { const int kby = kbx * (qk/QK8_1); // y block index that aligns with kbx @@ -162,18 +175,19 @@ static __global__ void mul_mat_vec_q( const int kqs = vdr * (tid % (qi/vdr)); #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { - tmp[j][i] += vec_dot_q_cuda(vx, &y[j*blocks_per_col_y + kby], (row0 + i)*blocks_per_row_x + kbx, kqs); + tmp[j][i] += vec_dot_q_cuda( + vx, &y[j*stride_col_y + kby], kbx_offset + i*stride_row_x + kbx, kqs); } } } - __shared__ float tmp_shared[nwarps-1 > 0 ? nwarps-1 : 1][ncols_y][rows_per_cuda_block][warp_size]; + __shared__ float tmp_shared[nwarps-1 > 0 ? 
nwarps-1 : 1][ncols_dst][rows_per_cuda_block][warp_size]; if (threadIdx.y > 0) { #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { tmp_shared[threadIdx.y-1][j][i][threadIdx.x] = tmp[j][i]; @@ -185,9 +199,11 @@ static __global__ void mul_mat_vec_q( return; } + dst += sample_dst*stride_sample_dst + channel_dst*stride_channel_dst + row0; + // sum up partial sums and write back result #pragma unroll - for (int j = 0; j < ncols_y; ++j) { + for (int j = 0; j < ncols_dst; ++j) { #pragma unroll for (int i = 0; i < rows_per_cuda_block; ++i) { #pragma unroll @@ -197,88 +213,121 @@ static __global__ void mul_mat_vec_q( tmp[j][i] = warp_reduce_sum(tmp[j][i]); } - if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < (unsigned)nrows_dst)) { - dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x]; + if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + int(threadIdx.x) < stride_col_dst)) { + dst[j*stride_col_dst + threadIdx.x] = tmp[j][threadIdx.x]; } } - - GGML_UNUSED(nrows_x); } -static std::pair calc_launch_params(const int ncols_y, const int nrows_x, const int warp_size, const mmvq_parameter_table_id table_id) { - const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_y, table_id) - 1) / calc_rows_per_block(ncols_y, table_id); - const dim3 block_nums(nblocks, 1, 1); - const dim3 block_dims(warp_size, calc_nwarps(ncols_y, table_id), 1); +static std::pair calc_launch_params( + const int ncols_dst, const int nrows_x, const int nchannels_y, const int nsamples_y, + const int warp_size, const mmvq_parameter_table_id table_id) { + const int64_t nblocks = (nrows_x + calc_rows_per_block(ncols_dst, table_id) - 1) / calc_rows_per_block(ncols_dst, table_id); + const dim3 block_nums(nblocks, nchannels_y, nsamples_y); + const dim3 block_dims(warp_size, calc_nwarps(ncols_dst, table_id), 1); return {block_nums, block_dims}; } template -static void mul_mat_vec_q_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { +static void mul_mat_vec_q_switch_ncols_dst( + const void * vx, const void * vy, const int32_t * ids, float * dst, + const int ncols_x, const int nrows_x, const int ncols_dst, + const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int nchannels_x, const int nchannels_y, const int nchannels_dst, + const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + cudaStream_t stream) { GGML_ASSERT(ncols_x % ggml_blck_size(type) == 0); - GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE); + GGML_ASSERT(ncols_dst <= MMVQ_MAX_BATCH_SIZE); + + const int channel_ratio = nchannels_dst / nchannels_x; + const int sample_ratio = nsamples_dst / nsamples_x; const int device = ggml_cuda_get_device(); const int warp_size = ggml_cuda_info().devices[device].warp_size; const mmvq_parameter_table_id table_id = get_device_table_id(ggml_cuda_info().devices[device].cc); - switch (ncols_y) { + GGML_ASSERT(!ids || ncols_dst == 1); + switch (ncols_dst) { case 1: { - constexpr int c_ncols_y = 1; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int 
c_ncols_dst = 1; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 2: { - constexpr int c_ncols_y = 2; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 2; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 3: { - constexpr int c_ncols_y = 3; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 3; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 4: { - constexpr int c_ncols_y = 4; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 4; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 5: { - constexpr int c_ncols_y = 5; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 5; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 6: { - constexpr int c_ncols_y = 6; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 6; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 7: { - constexpr int c_ncols_y = 7; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, 
nrows_dst); + constexpr int c_ncols_dst = 7; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } case 8: { - constexpr int c_ncols_y = 8; - std::pair dims = calc_launch_params(c_ncols_y, nrows_x, warp_size, table_id); - mul_mat_vec_q<<>>(vx, vy, dst, ncols_x, nrows_x, nrows_y, nrows_dst); + constexpr int c_ncols_dst = 8; + std::pair dims = calc_launch_params(c_ncols_dst, nrows_x, nchannels_dst, nsamples_dst, warp_size, table_id); + mul_mat_vec_q<<>> + (vx, vy, ids, dst, ncols_x, nchannels_y, stride_row_x, stride_col_y, stride_col_dst, + channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst, + sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst); break; } default: @@ -287,221 +336,241 @@ static void mul_mat_vec_q_cuda( } } -static void mul_mat_vec_q4_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q4_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q5_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q5_1_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q8_0_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q2_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q3_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q4_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - 
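    // The per-type wrappers deleted in this hunk (mul_mat_vec_q4_0_q8_1_cuda through
    // mul_mat_vec_iq3_s_q8_1_cuda) each forwarded an identical argument list to
    // mul_mat_vec_q_cuda for a fixed ggml_type; they are superseded by the templated
    // mul_mat_vec_q_switch_type dispatcher added later in this patch.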
-static void mul_mat_vec_q5_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_q6_K_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_xxs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_xs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq2_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq3_xxs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq1_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq1_m_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq4_nl_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq4_xs_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -static void mul_mat_vec_iq3_s_q8_1_cuda( - const void * vx, const void * vy, float * dst, - const int ncols_x, const int nrows_x, const int nrows_y, const int ncols_y, const int nrows_dst, cudaStream_t stream) { - - mul_mat_vec_q_cuda(vx, vy, dst, ncols_x, nrows_x, nrows_y, ncols_y, nrows_dst, stream); -} - -void ggml_cuda_op_mul_mat_vec_q( - ggml_backend_cuda_context & ctx, - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * 
dst, const char * src0_dd_i, const float * src1_ddf_i, - const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, - const int64_t src1_padded_row_size, cudaStream_t stream) { - - const int64_t ne00 = src0->ne[0]; - const int64_t row_diff = row_high - row_low; - - const int64_t ne10 = src1->ne[0]; - GGML_ASSERT(ne10 % QK8_1 == 0); - - const int64_t ne0 = dst->ne[0]; - - int id = ggml_cuda_get_device(); - - // the main device has a larger memory buffer to hold the results from all GPUs - // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; - - switch (src0->type) { +static void mul_mat_vec_q_switch_type( + const void * vx, const ggml_type type_x, const void * vy, const int32_t * ids, float * dst, + const int ncols_x, const int nrows_x, const int ncols_dst, + const int stride_row_x, const int stride_col_y, const int stride_col_dst, + const int nchannels_x, const int nchannels_y, const int nchannels_dst, + const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst, + const int nsamples_x, const int nsamples_dst, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst, + cudaStream_t stream) { + switch (type_x) { case GGML_TYPE_Q4_0: - mul_mat_vec_q4_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_Q4_1: - mul_mat_vec_q4_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_Q5_0: - mul_mat_vec_q5_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_Q5_1: - mul_mat_vec_q5_1_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_Q8_0: - mul_mat_vec_q8_0_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, 
stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_Q2_K: - mul_mat_vec_q2_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_Q3_K: - mul_mat_vec_q3_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_Q4_K: - mul_mat_vec_q4_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_Q5_K: - mul_mat_vec_q5_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_Q6_K: - mul_mat_vec_q6_K_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_IQ2_XXS: - mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_IQ2_XS: - mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_IQ2_S: - 
mul_mat_vec_iq2_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_IQ3_XXS: - mul_mat_vec_iq3_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_IQ1_S: - mul_mat_vec_iq1_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_IQ1_M: - mul_mat_vec_iq1_m_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_IQ4_NL: - mul_mat_vec_iq4_nl_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_IQ4_XS: - mul_mat_vec_iq4_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; case GGML_TYPE_IQ3_S: - mul_mat_vec_iq3_s_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, src1_padded_row_size, src1_ncols, nrows_dst, stream); + mul_mat_vec_q_switch_ncols_dst + (vx, vy, ids, dst, ncols_x, nrows_x, ncols_dst, stride_row_x, stride_col_y, stride_col_dst, + nchannels_x, nchannels_y, nchannels_dst, stride_channel_x, stride_channel_y, stride_channel_dst, + nsamples_x, nsamples_dst, stride_sample_x, stride_sample_y, stride_sample_dst, + stream); break; default: GGML_ABORT("fatal error"); break; } +} + +void ggml_cuda_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * 
dst) { + GGML_ASSERT( src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + GGML_ASSERT(!ids || ids->type == GGML_TYPE_I32); // Optional, used for batched GGML_MUL_MAT_ID. + + GGML_TENSOR_BINARY_OP_LOCALS; + + cudaStream_t stream = ctx.stream(); + + const size_t ts_src0 = ggml_type_size(src0->type); + const size_t ts_src1 = ggml_type_size(src1->type); + const size_t ts_dst = ggml_type_size(dst->type); + + GGML_ASSERT( nb00 == ts_src0); + GGML_ASSERT( nb10 == ts_src1); + GGML_ASSERT( nb0 == ts_dst); + GGML_ASSERT(!ids || ids->nb[0] == ggml_type_size(ids->type)); + + GGML_ASSERT(!ids || ne12 == 1); // Implementation is only correct for batch size 1. + + const float * src1_d = (const float *) src1->data; + const int32_t * ids_d = ids ? (const int32_t *) ids->data : nullptr; + float * dst_d = (float *) dst->data; + + const int64_t ne10_padded = GGML_PAD(ne10, MATRIX_ROW_PADDING); + ggml_cuda_pool_alloc src1_q8_1(ctx.pool(), ne13*ne12 * ne11*ne10_padded * sizeof(block_q8_1)/QK8_1); + { + const int64_t s11 = src1->nb[1] / ts_src1; + const int64_t s12 = src1->nb[2] / ts_src1; + const int64_t s13 = src1->nb[3] / ts_src1; + quantize_row_q8_1_cuda(src1_d, src1_q8_1.get(), src0->type, ne10, s11, s12, s13, ne10_padded, ne11, ne12, ne13, stream); + } + + const int64_t s01 = src0->nb[1] / ts_src0; + const int64_t s11 = ne10_padded / QK8_1; + const int64_t s1 = dst->nb[1] / ts_dst; + const int64_t s02 = src0->nb[2] / ts_src0; + const int64_t s2 = dst->nb[2] / ts_dst; + const int64_t s03 = src0->nb[3] / ts_src0; + const int64_t s3 = dst->nb[3] / ts_dst; + + const int64_t s12 = ne11*s11; + const int64_t s13 = ne12*s12; + + // For MUL_MAT_ID the memory layout is different than for MUL_MAT: + const int64_t ncols_dst = ids ? ne2 : ne1; + const int64_t nchannels_y = ids ? ne11 : ne12; + const int64_t nchannels_dst = ids ? ne1 : ne2; + const int64_t stride_col_dst = ids ? s2 : s1; + const int64_t stride_col_y = ids ? s12 : s11; + const int64_t stride_channel_dst = ids ? s1 : s2; + const int64_t stride_channel_y = ids ? s11 : s12; + + mul_mat_vec_q_switch_type( + src0->data, src0->type, src1_q8_1.get(), ids_d, dst_d, ne00, + ne01, ncols_dst, s01, stride_col_y, stride_col_dst, + ne02, nchannels_y, nchannels_dst, s02, stride_channel_y, stride_channel_dst, + ne03, ne3, s03, s13, s3, stream); +} + +void ggml_cuda_op_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, + const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, + const int64_t src1_padded_row_size, cudaStream_t stream) { + + const int64_t ne00 = src0->ne[0]; + const int64_t row_diff = row_high - row_low; + + const int64_t ne10 = src1->ne[0]; + GGML_ASSERT(ne10 % QK8_1 == 0); + + const int64_t ne0 = dst->ne[0]; + + int id = ggml_cuda_get_device(); + + // the main device has a larger memory buffer to hold the results from all GPUs + // nrows_dst == nrows of the matrix that the kernel writes into + const int64_t nrows_dst = id == ctx.device ? 
ne0 : row_diff; + + const int stride_row_x = ne00 / ggml_blck_size(src0->type); + const int stride_col_y = src1_padded_row_size / QK8_1; + + mul_mat_vec_q_switch_type( + src0_dd_i, src0->type, src1_ddq_i, nullptr, dst_dd_i, ne00, row_diff, src1_ncols, stride_row_x, stride_col_y, nrows_dst, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, stream); GGML_UNUSED(src1); GGML_UNUSED(dst); diff --git a/ggml/src/ggml-cuda/mmvq.cuh b/ggml/src/ggml-cuda/mmvq.cuh index d9e42fdd6d16c..39dc7d33eb5ac 100644 --- a/ggml/src/ggml-cuda/mmvq.cuh +++ b/ggml/src/ggml-cuda/mmvq.cuh @@ -2,6 +2,9 @@ #define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels. +void ggml_cuda_mul_mat_vec_q(ggml_backend_cuda_context & ctx, + const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst); + void ggml_cuda_op_mul_mat_vec_q( ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, diff --git a/ggml/src/ggml-cuda/quantize.cu b/ggml/src/ggml-cuda/quantize.cu index 1702e4ce2feba..3bab47d56a22e 100644 --- a/ggml/src/ggml-cuda/quantize.cu +++ b/ggml/src/ggml-cuda/quantize.cu @@ -1,30 +1,40 @@ #include "quantize.cuh" #include -static __global__ void quantize_q8_1(const float * __restrict__ x, void * __restrict__ vy, const int64_t kx, const int64_t kx0_padded) { - const int64_t ix0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; +static __global__ void quantize_q8_1( + const float * __restrict__ x, void * __restrict__ vy, + const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int ne1, const int ne2) { + const int64_t i0 = (int64_t)blockDim.x*blockIdx.x + threadIdx.x; - if (ix0 >= kx0_padded) { + if (i0 >= ne0) { return; } - const int64_t ix1 = blockIdx.y; + const int64_t i1 = blockIdx.y; + const int64_t i2 = blockIdx.z % ne2; + const int64_t i3 = blockIdx.z / ne2; + + const int64_t & i00 = i0; + const int64_t & i01 = i1; + const int64_t & i02 = i2; + const int64_t & i03 = i3; - const int64_t i_padded = ix1*kx0_padded + ix0; + const int64_t i_cont = ((i3*ne2 + i2) * ne1 + i1) * ne0 + i0; block_q8_1 * y = (block_q8_1 *) vy; - const int64_t ib = i_padded / QK8_1; // block index - const int64_t iqs = i_padded % QK8_1; // quant index + const int64_t ib = i_cont / QK8_1; // block index + const int64_t iqs = i_cont % QK8_1; // quant index - const float xi = ix0 < kx ? x[ix1*kx + ix0] : 0.0f; + const float xi = i0 < ne00 ? x[i03*s03 + i02*s02 + i01*s01 + i00] : 0.0f; float amax = fabsf(xi); float sum = xi; amax = warp_reduce_max(amax); - sum = warp_reduce_sum(sum); + sum = warp_reduce_sum(sum); - const float d = amax / 127; + const float d = amax / 127; const int8_t q = amax == 0.0f ? 
0 : roundf(xi / d); y[ib].qs[iqs] = q; @@ -127,43 +137,45 @@ static __global__ void quantize_mmq_q8_1( } void quantize_row_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, - const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { - GGML_ASSERT(kx0_padded % QK8_1 == 0); + GGML_ASSERT(ne0 % QK8_1 == 0); - const int64_t block_num_x = (kx0_padded + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; - const dim3 num_blocks(block_num_x, kx1*channels, 1); + const int64_t block_num_x = (ne0 + CUDA_QUANTIZE_BLOCK_SIZE - 1) / CUDA_QUANTIZE_BLOCK_SIZE; + const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE, 1, 1); - quantize_q8_1<<>>(x, vy, kx0, kx0_padded); - - GGML_UNUSED(type_x); + quantize_q8_1<<>>(x, vy, ne00, s01, s02, s03, ne0, ne1, ne2); + GGML_UNUSED(type_src0); } void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, - const int64_t kx0_padded, const ggml_type type_x, cudaStream_t stream) { + const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream) { - GGML_ASSERT(kx0_padded % (4*QK8_1) == 0); + GGML_ASSERT(ne0 % (4*QK8_1) == 0); - const int64_t block_num_x = (kx0_padded + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); - const dim3 num_blocks(block_num_x, kx1, channels); + const int64_t block_num_x = (ne0 + 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ - 1) / (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ); + const dim3 num_blocks(block_num_x, ne1, ne2*ne3); const dim3 block_size(CUDA_QUANTIZE_BLOCK_SIZE_MMQ, 1, 1); - switch (mmq_get_q8_1_ds_layout(type_x)) { + switch (mmq_get_q8_1_ds_layout(type_src0)) { case MMQ_Q8_1_DS_LAYOUT_D4: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, vy, ne00, ne1, ne0); break; case MMQ_Q8_1_DS_LAYOUT_DS4: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, vy, ne00, ne1, ne0); break; case MMQ_Q8_1_DS_LAYOUT_D2S6: quantize_mmq_q8_1 - <<>>(x, vy, kx0, kx1, kx0_padded); + <<>>(x, vy, ne00, ne1, ne0); break; default: GGML_ABORT("fatal error"); break; } + GGML_UNUSED(s01); + GGML_UNUSED(s02); + GGML_UNUSED(s03); } diff --git a/ggml/src/ggml-cuda/quantize.cuh b/ggml/src/ggml-cuda/quantize.cuh index 03bf322b95873..b627c4e4008b4 100644 --- a/ggml/src/ggml-cuda/quantize.cuh +++ b/ggml/src/ggml-cuda/quantize.cuh @@ -12,13 +12,13 @@ static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access."); typedef void (*quantize_cuda_t)( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream); void quantize_row_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const 
int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream); void quantize_mmq_q8_1_cuda( - const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded, - const ggml_type type_x, cudaStream_t stream); + const float * x, void * vy, const ggml_type type_src0, const int64_t ne00, const int64_t s01, const int64_t s02, const int64_t s03, + const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3, cudaStream_t stream); diff --git a/ggml/src/ggml-cuda/vecdotq.cuh b/ggml/src/ggml-cuda/vecdotq.cuh index 40091a0ef07b4..ba195e1d100d3 100644 --- a/ggml/src/ggml-cuda/vecdotq.cuh +++ b/ggml/src/ggml-cuda/vecdotq.cuh @@ -1,3 +1,5 @@ +#pragma once + #include "common.cuh" #include diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 5f6f87d1a3a7b..61751755b317b 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2071,7 +2071,7 @@ struct test_mul_mat_id : public test_case { const ggml_type type_b; const int n_mats; const int n_used; - const bool b; // brodcast b matrix + const bool b; // broadcast b matrix const int64_t m; const int64_t n; const int64_t k; From 2cca6c01e46d2fc1124d15730273ed2acdad1016 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Wed, 23 Apr 2025 10:32:49 +0300 Subject: [PATCH 013/200] rpc : add command line option for number of threads for the CPU backend (#13060) closes #13051 --- examples/rpc/rpc-server.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/examples/rpc/rpc-server.cpp b/examples/rpc/rpc-server.cpp index 9db5542570de8..0277e25cb5ec2 100644 --- a/examples/rpc/rpc-server.cpp +++ b/examples/rpc/rpc-server.cpp @@ -22,6 +22,7 @@ #include "ggml-rpc.h" #ifdef _WIN32 +# define NOMINMAX # define DIRECTORY_SEPARATOR '\\' # include # include @@ -37,6 +38,8 @@ #include #include #include +#include +#include namespace fs = std::filesystem; @@ -150,12 +153,14 @@ struct rpc_server_params { int port = 50052; size_t backend_mem = 0; bool use_cache = false; + int n_threads = std::max(1U, std::thread::hardware_concurrency()/2); }; static void print_usage(int /*argc*/, char ** argv, rpc_server_params params) { fprintf(stderr, "Usage: %s [options]\n\n", argv[0]); fprintf(stderr, "options:\n"); fprintf(stderr, " -h, --help show this help message and exit\n"); + fprintf(stderr, " -t, --threads number of threads for the CPU backend (default: %d)\n", params.n_threads); fprintf(stderr, " -H HOST, --host HOST host to bind to (default: %s)\n", params.host.c_str()); fprintf(stderr, " -p PORT, --port PORT port to bind to (default: %d)\n", params.port); fprintf(stderr, " -m MEM, --mem MEM backend memory size (in MB)\n"); @@ -172,6 +177,15 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & return false; } params.host = argv[i]; + } else if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + return false; + } + params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + fprintf(stderr, "error: invalid number of threads: %d\n", params.n_threads); + return false; + } } else if (arg == "-p" || arg == "--port") { if (++i >= argc) { return false; @@ -199,7 +213,7 @@ static bool rpc_server_params_parse(int argc, char ** argv, rpc_server_params & return true; } 
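One detail worth noting in the new -t/--threads handling: std::thread::hardware_concurrency() is allowed to return 0 when the core count cannot be determined, which is why the default is written as std::max(1U, hardware_concurrency()/2) rather than a plain division, and user-supplied values are rejected unless strictly positive. A minimal sketch of that default and validation:

```cpp
// Minimal sketch of the CPU-backend thread-count default and validation in the
// rpc-server patch: hardware_concurrency() may legally report 0 ("unknown"),
// so the halved value is clamped to at least 1, and parsed values must be
// positive before they are handed to ggml_backend_cpu_set_n_threads().
#include <algorithm>
#include <cstdio>
#include <string>
#include <thread>

static int default_n_threads() {
    return (int) std::max(1U, std::thread::hardware_concurrency()/2);
}

static bool parse_n_threads(const std::string & arg, int & n_threads) {
    n_threads = std::stoi(arg);
    if (n_threads <= 0) {
        std::fprintf(stderr, "error: invalid number of threads: %d\n", n_threads);
        return false;
    }
    return true;
}

int main() {
    int n = default_n_threads();
    std::printf("default CPU backend threads: %d\n", n);
    return parse_n_threads("8", n) ? 0 : 1;
}
```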
-static ggml_backend_t create_backend() { +static ggml_backend_t create_backend(const rpc_server_params & params) { ggml_backend_t backend = NULL; #ifdef GGML_USE_CUDA fprintf(stderr, "%s: using CUDA backend\n", __func__); @@ -231,6 +245,7 @@ static ggml_backend_t create_backend() { if (!backend) { fprintf(stderr, "%s: using CPU backend\n", __func__); backend = ggml_backend_cpu_init(); + ggml_backend_cpu_set_n_threads(backend, params.n_threads); } return backend; } @@ -275,7 +290,7 @@ int main(int argc, char * argv[]) { fprintf(stderr, "\n"); } - ggml_backend_t backend = create_backend(); + ggml_backend_t backend = create_backend(params); if (!backend) { fprintf(stderr, "Failed to create backend\n"); return 1; From eb1776b15a32d832f1266deeeab75b9d255c5849 Mon Sep 17 00:00:00 2001 From: piDack <104877312+piDack@users.noreply.github.com> Date: Wed, 23 Apr 2025 22:59:14 +0800 Subject: [PATCH 014/200] convert : Append mult-eos,half-rope,bos to GLM4-0414 and Z (#13021) * append mult-eos,half-rope,bos to GLM4-0414 * remove unset var --- convert_hf_to_gguf.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 645bdad9b57d2..04131319b1b8d 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5079,10 +5079,25 @@ class Glm4Model(TextModel): model_arch = gguf.MODEL_ARCH.GLM4 def set_vocab(self): - self._set_vocab_gpt2() + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + tokens, toktypes, tokpre = self.get_vocab_base() + self.gguf_writer.add_tokenizer_model("gpt2") + self.gguf_writer.add_tokenizer_pre(tokpre) + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_types(toktypes) + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) + special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) + special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): super().set_gguf_parameters() + rope_dim = self.hparams["head_dim"] + self.gguf_writer.add_rope_dimension_count(int(rope_dim * self.hparams.get("partial_rotary_factor", 0.5))) if self.hparams.get("rope_scaling") is not None and "factor" in self.hparams["rope_scaling"]: if self.hparams["rope_scaling"].get("type") == "yarn": self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.YARN) From ecda2ec4b347031a9b8a89ee2efc664ce63f599c Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 23 Apr 2025 20:21:59 +0200 Subject: [PATCH 015/200] mtmd : Support Pixtral 12B (#13065) * add pixtral text model (vision is wip) * cgraph ok, just missing 2D RoPE * fix bad rebase * first working version * fix problem with img_break token * support dynamic image size * update docs * update test script --- convert_hf_to_gguf.py | 66 +++++- convert_hf_to_gguf_update.py | 1 + docs/multimodal/gemma3.md | 12 +- examples/llava/README.md | 28 +++ examples/llava/clip-impl.h | 4 + examples/llava/clip.cpp | 345 +++++++++++++++++++++++++++-- examples/llava/mtmd.cpp | 16 +- examples/llava/tests.sh | 17 ++ gguf-py/gguf/constants.py | 7 + gguf-py/gguf/tensor_mapping.py | 18 ++ include/llama.h | 1 + 
models/ggml-vocab-pixtral.gguf.inp | 112 ++++++++++ models/ggml-vocab-pixtral.gguf.out | 46 ++++ src/llama-vocab.cpp | 3 +- 14 files changed, 644 insertions(+), 32 deletions(-) create mode 100644 models/ggml-vocab-pixtral.gguf.inp create mode 100644 models/ggml-vocab-pixtral.gguf.out diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 04131319b1b8d..cf35fb86ecfec 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -776,6 +776,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2": # ref: https://huggingface.co/THUDM/glm-4-9b-hf res = "glm4" + if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3": + # ref: https://huggingface.co/mistral-community/pixtral-12b + res = "pixtral" if res is None: logger.warning("\n") @@ -1724,7 +1727,8 @@ def prepare_tensors(self): "MistralForCausalLM", "MixtralForCausalLM", "Idefics3ForConditionalGeneration", - "SmolVLMForConditionalGeneration") + "SmolVLMForConditionalGeneration", + "LlavaForConditionalGeneration") class LlamaModel(TextModel): model_arch = gguf.MODEL_ARCH.LLAMA undo_permute = True @@ -1734,6 +1738,10 @@ def __init__(self, *args, **kwargs): # fix for SmolVLM2, missing `num_attention_heads` in config.json if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration": self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) + # fix for Pixtral, missing `num_attention_heads` in config.json + if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \ + and self.hparams.get("model_type") == "mistral": + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32) def set_vocab(self): try: @@ -1797,12 +1805,17 @@ def permute(weights: Tensor, n_head: int, n_head_kv: int | None): def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: n_head = self.hparams["num_attention_heads"] n_kv_head = self.hparams.get("num_key_value_heads") - is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name + is_vision_tensor = "vision_tower" in name \ + or "vision_model" in name \ + or "model.connector" in name \ + or "multi_modal_projector" in name if is_vision_tensor: return [] # skip vision tensors elif name.startswith("model.text_model"): name = name.replace("text_model.", "") # for SmolVLM + elif name.startswith("language_model."): + name = name.replace("language_model.", "") # for the rest if self.undo_permute: if name.endswith(("q_proj.weight", "q_proj.bias")): @@ -1885,6 +1898,55 @@ def prepare_tensors(self): raise ValueError(f"Unprocessed experts: {experts}") +@ModelBase.register("LlavaForConditionalGeneration") +class LlavaVisionModel(VisionModel): + img_break_tok_id = -1 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.hparams["model_type"] == "pixtral": + # fix missing config.json values + self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16) + self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24) + self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096) + self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024) + self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5) + self.img_break_tok_id = 12 # see tokenizer_config.json + else: + raise ValueError(f"Unsupported model type: {self.hparams['model_type']}") + + def 
set_gguf_parameters(self): + super().set_gguf_parameters() + hparams = self.hparams + if hparams["model_type"] == "pixtral": + self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL) + # default values below are taken from HF tranformers code + self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"]) + self.gguf_writer.add_vision_use_silu(True) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + n_head = self.hparams["num_attention_heads"] + n_kv_head = n_head + + if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."): + # process vision tensors + if name.endswith(("q_proj.weight", "q_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_head) + if name.endswith(("k_proj.weight", "k_proj.bias")): + data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head) + return [(self.map_tensor_name(name), data_torch)] + + if self.img_break_tok_id > 0 and "embed_tokens.weight" in name: + logger.info(f"Extracting [IMG_BREAK] token embedding from {name}") + # for pixtral model, we need to extract the [IMG_BREAK] token embedding + img_break_embd = data_torch[self.img_break_tok_id] + name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK] + return [(self.map_tensor_name(name), img_break_embd)] + + return [] # skip other tensors + + @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration") class SmolVLMModel(VisionModel): def __init__(self, *args, **kwargs): diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index 160c9fe0e616a..03a1d8d8c9b42 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -115,6 +115,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite", }, {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct", }, {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", }, + {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", }, ] diff --git a/docs/multimodal/gemma3.md b/docs/multimodal/gemma3.md index 8fa077de71985..110a36f40835d 100644 --- a/docs/multimodal/gemma3.md +++ b/docs/multimodal/gemma3.md @@ -11,15 +11,15 @@ You can use pre-quantized model from [ggml-org](https://huggingface.co/ggml-org) ```bash # build cmake -B build -cmake --build build --target llama-gemma3-cli +cmake --build build --target llama-mtmd-cli # alternatively, install from brew (MacOS) brew install llama.cpp # run it -llama-gemma3-cli -hf ggml-org/gemma-3-4b-it-GGUF -llama-gemma3-cli -hf ggml-org/gemma-3-12b-it-GGUF -llama-gemma3-cli -hf ggml-org/gemma-3-27b-it-GGUF +llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF +llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF +llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF # note: 1B model does not support vision ``` @@ -44,8 +44,8 @@ What you need: ```bash # build cmake -B build -cmake --build build --target llama-gemma3-cli +cmake --build build --target llama-mtmd-cli # run it -./build/bin/llama-gemma3-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg +./build/bin/llama-mtmd-cli -m {text_model}.gguf --mmproj mmproj.gguf --image your_image.jpg ``` diff --git a/examples/llava/README.md b/examples/llava/README.md index cadbc53fab0d7..f58d9de7107e8 100644 --- 
a/examples/llava/README.md +++ b/examples/llava/README.md @@ -14,6 +14,28 @@ The naming and structure related to multimodal support have evolved, which might - [#12849](https://github.com/ggml-org/llama.cpp/pull/12849): `libmtmd` was introduced as a replacement for `llava.cpp`. Its goals include providing a single, unified command-line interface, improving the user/developer experience (UX/DX), and supporting both audio and image inputs. - [#13012](https://github.com/ggml-org/llama.cpp/pull/13012): `mtmd-cli` was added, consolidating the various model-specific CLIs into a single tool powered by `libmtmd`. +## Pre-quantized models + +These are ready-to-use models, most of them come with `Q4_K_M` quantization by default: + +```sh +# Gemma 3 +llama-mtmd-cli -hf ggml-org/gemma-3-4b-it-GGUF +llama-mtmd-cli -hf ggml-org/gemma-3-12b-it-GGUF +llama-mtmd-cli -hf ggml-org/gemma-3-27b-it-GGUF + +# SmolVLM +llama-mtmd-cli -hf ggml-org/SmolVLM-Instruct-GGUF +llama-mtmd-cli -hf ggml-org/SmolVLM-256M-Instruct-GGUF +llama-mtmd-cli -hf ggml-org/SmolVLM-500M-Instruct-GGUF +llama-mtmd-cli -hf ggml-org/SmolVLM2-2.2B-Instruct-GGUF +llama-mtmd-cli -hf ggml-org/SmolVLM2-256M-Video-Instruct-GGUF +llama-mtmd-cli -hf ggml-org/SmolVLM2-500M-Video-Instruct-GGUF + +# Pixtral 12B +llama-mtmd-cli -hf ggml-org/pixtral-12b-GGUF +``` + ## How it works and what is `mmproj`? Multimodal support in `llama.cpp` works by encoding images into embeddings using a separate model component, and then feeding these embeddings into the language model. @@ -45,3 +67,9 @@ Multimodal projector (`mmproj`) files are specific to each model architecture. P - [MiniCPM-o 2.6](../../docs/multimodal/minicpmo2.6.md) - [IBM Granite Vision](../../docs/multimodal/granitevision.md) - [Google Gemma 3](../../docs/multimodal/gemma3.md) + +For the following models, you can use `convert_hf_to_gguf.py`with `--mmproj` flag to get the `mmproj` file: +- [Gemma 3](https://huggingface.co/collections/google/gemma-3-release-67c6c6f89c4f76621268bb6d) - Note: 1B variant does not have vision support +- SmolVLM (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) +- SmolVLM2 (from [HuggingFaceTB](https://huggingface.co/HuggingFaceTB)) +- [Pixtral 12B](https://huggingface.co/mistral-community/pixtral-12b) - only works with `transformers`-compatible checkpoint diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index faa8a9d5e9dbb..8d310fb0271c5 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -60,6 +60,7 @@ #define TN_ATTN_V "%s.blk.%d.attn_v.%s" #define TN_ATTN_OUTPUT "%s.blk.%d.attn_out.%s" #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_FFN_UP "%s.blk.%d.ffn_up.%s" #define TN_LN_1 "%s.blk.%d.ln1.%s" #define TN_LN_2 "%s.blk.%d.ln2.%s" @@ -73,6 +74,7 @@ #define TN_MM_INP_PROJ "mm.input_projection.weight" // gemma3 #define TN_MM_SOFT_EMB_N "mm.soft_emb_norm.weight" // gemma3 #define TN_MM_PROJECTOR "mm.model.fc.weight" // idefics3 +#define TN_TOK_IMG_BREAK "v.token_embd.img_break" // pixtral // mimicpmv #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k" @@ -101,6 +103,7 @@ enum projector_type { PROJECTOR_TYPE_MERGER, PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_IDEFICS3, + PROJECTOR_TYPE_PIXTRAL, PROJECTOR_TYPE_UNKNOWN, }; @@ -113,6 +116,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, + { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, }; static projector_type 
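The TN_* macros added here (TN_FFN_GATE, TN_TOK_IMG_BREAK) follow the existing convention: each is a printf-style template expanded per layer with the "v" prefix, the block index, and a "weight"/"bias" suffix, which keeps clip.cpp in sync with the names emitted by gguf-py's constants.py. A small illustration of the expansion, with snprintf standing in for the string_format() helper used by clip.cpp:

```cpp
// Illustration of how per-layer tensor names for the mmproj GGUF are built
// from the TN_* templates; snprintf stands in for string_format().
#include <cstdio>

#define TN_FFN_GATE      "%s.blk.%d.ffn_gate.%s"
#define TN_TOK_IMG_BREAK "v.token_embd.img_break"

int main() {
    char name[128];
    std::snprintf(name, sizeof(name), TN_FFN_GATE, "v", 3, "weight");
    std::printf("%s\n", name);              // v.blk.3.ffn_gate.weight
    std::printf("%s\n", TN_TOK_IMG_BREAK);  // fixed name, no per-layer index
}
```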
clip_projector_type_from_string(const std::string & str) { diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index ad39e1bd6194c..4eec4a2646798 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -163,7 +163,8 @@ struct clip_hparams { patch_merge_type mm_patch_merge_type = PATCH_MERGE_FLAT; - float eps; + float eps = 1e-6; + float rope_theta = 0.0; std::vector image_grid_pinpoints; int32_t image_crop_resolution; @@ -187,11 +188,17 @@ struct clip_layer { struct ggml_tensor * ln_1_b = nullptr; // ff - struct ggml_tensor * ff_i_w = nullptr; - struct ggml_tensor * ff_i_b = nullptr; - - struct ggml_tensor * ff_o_w = nullptr; - struct ggml_tensor * ff_o_b = nullptr; + struct ggml_tensor * ff_i_w = nullptr; // legacy naming + struct ggml_tensor * ff_i_b = nullptr; // legacy naming + struct ggml_tensor * ff_o_w = nullptr; // legacy naming + struct ggml_tensor * ff_o_b = nullptr; // legacy naming + + struct ggml_tensor * ff_up_w = nullptr; + struct ggml_tensor * ff_up_b = nullptr; + struct ggml_tensor * ff_gate_w = nullptr; + struct ggml_tensor * ff_gate_b = nullptr; + struct ggml_tensor * ff_down_w = nullptr; + struct ggml_tensor * ff_down_b = nullptr; // layernorm 2 struct ggml_tensor * ln_2_w = nullptr; @@ -297,6 +304,9 @@ struct clip_vision_model { // gemma3 struct ggml_tensor * mm_input_proj_w = nullptr; struct ggml_tensor * mm_soft_emb_norm_w = nullptr; + + // pixtral + struct ggml_tensor * token_embd_img_break = nullptr; }; struct clip_ctx { @@ -329,6 +339,7 @@ struct clip_ctx { ggml_backend_t backend_cpu; ggml_backend_buffer_ptr buf; + int max_nodes = 8192; ggml_backend_sched_ptr sched; clip_image_size load_image_size; @@ -544,6 +555,218 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im return gf; } +// implementation of the 2D RoPE without adding a new op in ggml +static ggml_tensor * build_rope_2d( + ggml_cgraph * gf, + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * pos_h, + ggml_tensor * pos_w, + const float freq_base +) { + ggml_tensor * tmp; + const int64_t n_dim = cur->ne[0]; + const int64_t n_head = cur->ne[1]; + const int64_t n_pos = cur->ne[2]; + + // for example, if we have cur tensor of shape (n_dim=8, n_head, n_pos) + // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 + // first half of cur will use 1e-0, 1e-2 (even) + // second half of cur will use 1e-1, 1e-3 (odd) + // + // for the first half, the trick here is to rotate n_dim/2, so inv_freq will be even + // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) + // then for the second half, we use freq_scale to shift the inv_freq + // ^ why? 
replace (2i) with (2i+1) in the above equation + const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim); + + // first half + { + cur = ggml_rope_ext_inplace( + ctx0, + cur, + pos_h, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + 1.0f, 0.0f, 1.0f, 0.0f, 0.0f + ); + } + + // second half + { + tmp = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + n_dim/2 * ggml_element_size(cur)); + tmp = ggml_rope_ext_inplace( + ctx0, + tmp, + pos_w, // positions + nullptr, // freq factors + n_dim/2, // n_dims + 0, 0, freq_base, + freq_scale_odd, + 0.0f, 1.0f, 0.0f, 0.0f + ); + // calculate inplace (modify cur directly) + ggml_build_forward_expand(gf, tmp); + } + + return cur; +} + +static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32_batch & imgs) { + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL); + GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1 + + int image_size_width = imgs.entries[0]->nx; + int image_size_height = imgs.entries[0]->ny; + + const int patch_size = hparams.patch_size; + const int n_patches_x = image_size_width / patch_size; + const int n_patches_y = image_size_height / patch_size; + const int num_patches = n_patches_x * n_patches_y; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context_ptr ctx0_ptr(ggml_init(params)); + auto ctx0 = ctx0_ptr.get(); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + // input raw + struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + // 2D input positions + struct ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + struct ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size); + inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + + struct ggml_tensor * embeddings = inp; + + // pre-layer norm + embeddings = ggml_mul(ctx0, ggml_rms_norm(ctx0, embeddings, eps), model.pre_ln_w); + + // loop over layers + for (int il = 0; il < n_layer; il++) { + struct ggml_tensor * cur = embeddings; + + // pre-attention norm + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_1_w); + + // self-attention + { + struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur); + + Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches); + Q = build_rope_2d(gf, ctx0, Q, pos_h, pos_w, hparams.rope_theta); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + + struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur); + + K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches); + K = build_rope_2d(gf, ctx0, K, pos_h, pos_w, hparams.rope_theta); + K = ggml_cont(ctx0, ggml_permute(ctx0, 
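Spelling out the frequency bookkeeping behind build_rope_2d: with base b and head width n_dim, standard RoPE rotates dimension pair i with frequency b^(-2i/n_dim). Running the ordinary 1-D RoPE over only n_dim/2 dimensions reproduces exactly the even-indexed frequencies, and scaling a second pass by freq_scale_odd shifts them to the odd-indexed ones:

```latex
\theta_i = b^{-2i/n_{\mathrm{dim}}}
\qquad\Rightarrow\qquad
b^{-2i/(n_{\mathrm{dim}}/2)} = b^{-2(2i)/n_{\mathrm{dim}}} = \theta_{2i},
\qquad
\underbrace{b^{-2/n_{\mathrm{dim}}}}_{\texttt{freq\_scale\_odd}} \cdot \theta_{2i}
  = b^{-2(2i+1)/n_{\mathrm{dim}}} = \theta_{2i+1}.
```

The first pass is driven by the row positions (pos_h) and rotates the first half of each head; the second pass is driven by the column positions (pos_w) and rotates the second half in place, so no new ggml op is needed.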
K, 0, 2, 1, 3)); + + struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur); + + V = ggml_reshape_3d(ctx0, V, d_head, n_head, num_patches); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_3d(ctx0, KQV, d_head, num_patches, n_head); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_2d(ctx0, KQV, hidden_size, num_patches); + + cur = ggml_mul_mat(ctx0, model.layers[il].o_w, cur); + } + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // pre-ffn norm + cur = ggml_mul(ctx0, ggml_rms_norm(ctx0, cur, eps), model.layers[il].ln_2_w); + + // feed-forward + { + ggml_tensor * gate_proj = ggml_mul_mat(ctx0, model.layers[il].ff_gate_w, cur); + ggml_tensor * up_proj = ggml_mul_mat(ctx0, model.layers[il].ff_up_w, cur); + gate_proj = ggml_silu(ctx0, gate_proj); // pixtral uses silu + cur = ggml_mul(ctx0, up_proj, gate_proj); + cur = ggml_mul_mat(ctx0, model.layers[il].ff_down_w, cur); + } + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // LlavaMultiModalProjector (with GELU activation) + { + embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + + embeddings = ggml_gelu(ctx0, embeddings); + embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_2_b); + } + + // arrangement of the [IMG_BREAK] token + { + // not efficient, but works + // the trick is to view the embeddings as a 3D tensor with shape [hidden_size, n_patches_per_row, n_rows] + // and then concatenate the [IMG_BREAK] token to the end of each row, aka n_patches_per_row dimension + // after the concatenation, we have a tensor with shape [hidden_size, n_patches_per_row + 1, n_rows] + + const int n_embd_text = embeddings->ne[0]; + const int n_tokens_output = num_patches + n_patches_y - 1; // one [IMG_BREAK] per row, except the last row + + ggml_tensor * cur = ggml_reshape_3d(ctx0, embeddings, n_embd_text, n_patches_x, n_patches_y); + ggml_tensor * tok = ggml_new_tensor_3d(ctx0, embeddings->type, n_embd_text, 1, n_patches_y); + tok = ggml_scale(ctx0, tok, 0.0); // clear the tensor + tok = ggml_add(ctx0, tok, model.token_embd_img_break); + cur = ggml_concat(ctx0, cur, tok, 1); + embeddings = ggml_view_2d(ctx0, cur, + n_embd_text, n_tokens_output, + ggml_row_size(cur->type, n_embd_text), 0); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} + static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { if (!ctx->has_vision_encoder) { LOG_ERR("This gguf file seems to have no vision encoder\n"); @@ -1118,6 +1341,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { res = clip_image_build_graph_siglip(ctx, imgs); } break; + case PROJECTOR_TYPE_PIXTRAL: + { + res = clip_image_build_graph_pixtral(ctx, imgs); + } break; default: { // TODO: we should have one build_* function per model @@ -1279,6 +1506,10 @@ struct clip_model_loader { { get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); } break; + case PROJECTOR_TYPE_PIXTRAL: + { + hparams.rope_theta = 
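The [IMG_BREAK] arrangement works by viewing the flat patch embeddings as an (n_embd, n_patches_x, n_patches_y) tensor, concatenating one broadcast copy of the break embedding per row, and then taking a 2-D view of the first num_patches + n_patches_y - 1 tokens, which drops the break that would otherwise follow the last row. A plain-array sketch of the same bookkeeping:

```cpp
// Plain-array sketch of the row-wise [IMG_BREAK] insertion: the patch
// embeddings are treated as n_y rows of n_x patches, the break embedding is
// appended after every row except the last, and the resulting token count is
// n_x*n_y + n_y - 1, matching the pixtral branch of clip_n_patches_by_img().
#include <cstddef>
#include <vector>

static std::vector<float> append_img_break(const std::vector<float> & patches,   // n_y*n_x*n_embd floats
                                           const std::vector<float> & img_break, // n_embd floats
                                           int n_x, int n_y, int n_embd) {
    std::vector<float> out;
    out.reserve((size_t)(n_x*n_y + n_y - 1)*n_embd);
    for (int y = 0; y < n_y; ++y) {
        for (int x = 0; x < n_x; ++x) {
            const float * src = patches.data() + ((size_t)y*n_x + x)*n_embd;
            out.insert(out.end(), src, src + n_embd);
        }
        if (y + 1 < n_y) { // the break after the last row is dropped
            out.insert(out.end(), img_break.begin(), img_break.end());
        }
    }
    return out;
}
```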
10000.0f; + } break; default: break; } @@ -1350,16 +1581,26 @@ struct clip_model_loader { layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "weight")); layer.ln_1_w = get_tensor(string_format(TN_LN_1, "v", il, "weight"), false); layer.ln_2_w = get_tensor(string_format(TN_LN_2, "v", il, "weight"), false); - layer.ff_i_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); - layer.ff_o_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight")); layer.k_b = get_tensor(string_format(TN_ATTN_K, "v", il, "bias"), false); layer.q_b = get_tensor(string_format(TN_ATTN_Q, "v", il, "bias"), false); layer.v_b = get_tensor(string_format(TN_ATTN_V, "v", il, "bias"), false); layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, "v", il, "bias"), false); layer.ln_1_b = get_tensor(string_format(TN_LN_1, "v", il, "bias"), false); layer.ln_2_b = get_tensor(string_format(TN_LN_2, "v", il, "bias"), false); - layer.ff_i_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); - layer.ff_o_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); + + // new naming + layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, "v", il, "weight")); + layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, "v", il, "bias"), false); + layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, "v", il, "weight"), false); + layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, "v", il, "bias"), false); + layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, "v", il, "weight")); + layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, "v", il, "bias"), false); + + // legacy naming (the in and out is reversed! don't ask me why) + layer.ff_i_w = layer.ff_down_w; + layer.ff_o_w = layer.ff_up_w; + layer.ff_i_b = layer.ff_down_b; + layer.ff_o_b = layer.ff_up_b; } switch (ctx_clip.proj_type) { @@ -1475,6 +1716,15 @@ struct clip_model_loader { { vision_model.projection = get_tensor(TN_MM_PROJECTOR); } break; + case PROJECTOR_TYPE_PIXTRAL: + { + vision_model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight")); + vision_model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias")); + vision_model.mm_2_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight")); + vision_model.mm_2_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias")); + // [IMG_BREAK] token embedding + vision_model.token_embd_img_break = get_tensor(TN_TOK_IMG_BREAK); + } break; default: GGML_ASSERT(false && "unknown projector type"); } @@ -1517,18 +1767,17 @@ struct clip_model_loader { } void alloc_compute_meta() { - ctx_clip.buf_compute_meta.resize(GGML_DEFAULT_GRAPH_SIZE * ggml_tensor_overhead() + ggml_graph_overhead()); + ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead()); // create a fake batch clip_image_f32_batch batch; clip_image_f32_ptr img(clip_image_f32_init()); clip_image_size image_size; - image_size.width = clip_get_image_size(&ctx_clip); - image_size.height = clip_get_image_size(&ctx_clip); - int n_patches = clip_get_image_size(&ctx_clip) / image_size.width; - img->nx = n_patches; - img->ny = n_patches; - img->buf.resize(n_patches * image_size.width * image_size.height * 3); + image_size.width = ctx_clip.vision_model.hparams.image_size; + image_size.height = ctx_clip.vision_model.hparams.image_size; + img->nx = image_size.width; + img->ny = image_size.height; + img->buf.resize(image_size.width * image_size.height * 3); batch.entries.push_back(std::move(img)); ggml_cgraph * gf = clip_image_build_graph(&ctx_clip, batch, image_size, false); @@ -1916,6 
+2165,26 @@ struct image_manipulation { } } + // calculate the size of the **resized** image, while preserving the aspect ratio + // the calculated size will be aligned to the nearest multiple of align_size + // if H or W size is larger than max_dimension, it will be resized to max_dimension + static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int max_dimension) { + if (inp_size.width <= 0 || inp_size.height <= 0 || align_size <= 0 || max_dimension <= 0) { + return {0, 0}; + } + + float scale = std::min(1.0f, std::min(static_cast(max_dimension) / inp_size.width, + static_cast(max_dimension) / inp_size.height)); + + float target_width_f = static_cast(inp_size.width) * scale; + float target_height_f = static_cast(inp_size.height) * scale; + + int aligned_width = GGML_PAD((int)target_width_f, align_size); + int aligned_height = GGML_PAD((int)target_height_f, align_size); + + return {aligned_width, aligned_height}; + } + private: static inline int clip(int x, int lower, int upper) { return std::max(lower, std::min(x, upper)); @@ -2247,8 +2516,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); return true; } - - if (ctx->has_glm_projector + else if (ctx->has_glm_projector || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { clip_image_u8 resized_image; @@ -2260,6 +2528,15 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); return true; } + else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { + clip_image_u8 resized_image; + auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size); + image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height); + clip_image_f32_ptr img_f32(clip_image_f32_init()); + normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std); + res_imgs->entries.push_back(std::move(img_f32)); + return true; + } // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104) // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156 @@ -2387,6 +2664,10 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i n_patches = 256; } else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { n_patches /= ctx->vision_model.hparams.proj_scale_factor; + } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { + int n_patches_x = img->nx / params.patch_size; + int n_patches_y = img->ny / params.patch_size; + n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row } return n_patches; @@ -2540,10 +2821,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); float * data = (float *)malloc(ggml_nbytes(inp_raw)); + // TODO @ngxson : this whole code block is ugly, will need to be refactored for (size_t i = 0; i < imgs.entries.size(); i++) { const int nx = imgs.entries[i]->nx; const int ny = imgs.entries[i]->ny; - if (!(ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger)) { + + if (ctx->has_glm_projector + || ctx->has_llava_projector + || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 + || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { GGML_ASSERT(nx == image_size 
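Taken together, calc_size_preserved_ratio and the pixtral branch of clip_n_patches_by_img determine the image-token budget directly from the raw image size. A self-contained re-derivation with illustrative numbers (patch size 16 and a 1024-pixel cap are assumptions for the example, not values read from any particular checkpoint):

```cpp
// Worked example of the pixtral preprocessing math: scale the image down (never
// up) to fit max_dim while preserving the aspect ratio, round each side up to a
// multiple of the patch size, then count one token per patch plus one
// [IMG_BREAK] per patch row except the last.
#include <algorithm>
#include <cstdio>

static int pad_up(int x, int n) { return ((x + n - 1)/n)*n; } // same as GGML_PAD

int main() {
    const int patch_size = 16, max_dim = 1024; // illustrative values
    const int w = 800, h = 600;                // example input image

    const float scale = std::min(1.0f, std::min((float)max_dim/w, (float)max_dim/h));
    const int rw = pad_up((int)(w*scale), patch_size); // 800
    const int rh = pad_up((int)(h*scale), patch_size); // 608

    const int nx = rw/patch_size, ny = rh/patch_size;  // 50 x 38 patches
    const int n_tokens = nx*ny + ny - 1;               // 1937 image tokens
    std::printf("%dx%d -> %dx%d -> %d image tokens\n", w, h, rw, rh, n_tokens);
}
```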
&& ny == image_size); } @@ -2657,6 +2943,24 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { // do nothing } + else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { + // set the 2D positions + int n_patches_per_col = image_size_width / patch_size; + std::vector pos_data(num_positions); + struct ggml_tensor * pos; + // dimension H + pos = ggml_graph_get_tensor(gf, "pos_h"); + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i / n_patches_per_col; + } + ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos)); + // dimension W + pos = ggml_graph_get_tensor(gf, "pos_w"); + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i % n_patches_per_col; + } + ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos)); + } else { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); @@ -2849,6 +3153,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_LDPV2: return ctx->vision_model.mm_model_peg_0_b->ne[0]; case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_PIXTRAL: return ctx->vision_model.mm_2_b->ne[0]; case PROJECTOR_TYPE_MLP_NORM: return ctx->vision_model.mm_3_b->ne[0]; diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index c3fb2f18a2b02..11ca7b30f1ac6 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -190,6 +190,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx, // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215 marker_modified = "" + ctx->image_marker + ""; string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + + } else if (proj_type == PROJECTOR_TYPE_PIXTRAL) { + // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md + marker_modified = ctx->image_marker + "[IMG_END]"; + string_replace_all(prompt_modified, ctx->image_marker, marker_modified); } // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix @@ -219,7 +224,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, for (auto & entry : batch_f32.entries) { mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = clip_n_patches(ctx->ctx_clip); + image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get()); image_tokens->ny = 1; image_tokens->batch_f32.entries.push_back(std::move(entry)); image_tokens->id = id; @@ -313,8 +318,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } } else { + size_t n_tokens = 0; + for (const auto & entry : batch_f32.entries) { + n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get()); + } + mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = clip_n_patches(ctx->ctx_clip) * batch_f32.entries.size(); // TODO @ngxson : use clip_n_patches_by_image + image_tokens->nx = n_tokens; image_tokens->ny = 1; // TODO image_tokens->batch_f32 = std::move(batch_f32); image_tokens->id = bitmaps[i_img].id; // optional @@ -382,7 +392,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; for (size_t i = 0; i < entries.size(); i++) { - int n_tokens_per_image = clip_n_patches(ctx->ctx_clip); + int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, 
entries[i].get()); ok = clip_image_encode( ctx->ctx_clip, ctx->n_threads, diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh index 8752fc267a8ac..e612857edc55d 100755 --- a/examples/llava/tests.sh +++ b/examples/llava/tests.sh @@ -13,6 +13,14 @@ mkdir -p $SCRIPT_DIR/output PROJ_ROOT="$SCRIPT_DIR/../.." cd $PROJ_ROOT +# Check if the first argument is "big", then run test with big models +# This is useful if we're running the script on a larger machine, so we can test the big models +RUN_BIG_TESTS=false +if [ "${1:-}" = "big" ]; then + RUN_BIG_TESTS=true + echo "Include BIG models..." +fi + ############### arr_bin=() @@ -28,6 +36,12 @@ add_test() { arr_tmpl+=("$tmpl") } +add_test_big() { + if [ "$RUN_BIG_TESTS" = true ]; then + add_test "$@" + fi +} + add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0" add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M" add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0" @@ -42,6 +56,9 @@ add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" +# to test the big models, run: ./tests.sh big +add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M" + # these models always give the wrong answer, not sure why # add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M" # add_test "llama-mtmd-cli" "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0" diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 59510bd0c2a2a..b81017b142583 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -485,6 +485,7 @@ class MODEL_TENSOR(IntEnum): V_ENC_OUTPUT = auto() V_ENC_OUTPUT_NORM = auto() V_ENC_FFN_UP = auto() + V_ENC_FFN_GATE = auto() V_ENC_FFN_DOWN = auto() V_PRE_NORM = auto() V_POST_NORM = auto() @@ -501,6 +502,7 @@ class MODEL_TENSOR(IntEnum): V_RESMPL_Q_NORM = auto() # minicpmv V_RESMPL_PROJ = auto() # minicpmv V_RESMPL_QUERY = auto() # minicpmv + V_TOK_EMBD_IMG_BREAK = auto() # pixtral MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -737,6 +739,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_OUTPUT: "v.blk.{bid}.attn_out", MODEL_TENSOR.V_ENC_OUTPUT_NORM: "v.blk.{bid}.ln2", MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up", + MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate", MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down", MODEL_TENSOR.V_PRE_NORM: "v.pre_ln", MODEL_TENSOR.V_POST_NORM: "v.post_ln", @@ -753,6 +756,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q", MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj", MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query", + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -771,6 +775,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_ENC_OUTPUT, MODEL_TENSOR.V_ENC_OUTPUT_NORM, MODEL_TENSOR.V_ENC_FFN_UP, + MODEL_TENSOR.V_ENC_FFN_GATE, MODEL_TENSOR.V_ENC_FFN_DOWN, MODEL_TENSOR.V_PRE_NORM, MODEL_TENSOR.V_POST_NORM, @@ -787,6 +792,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.V_RESMPL_Q_NORM, MODEL_TENSOR.V_RESMPL_PROJ, MODEL_TENSOR.V_RESMPL_QUERY, + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK, ], MODEL_ARCH.LLAMA: [ MODEL_TENSOR.TOKEN_EMBD, @@ -2129,6 +2135,7 @@ def get_type(val: Any) -> GGUFValueType: class VisionProjectorType: GEMMA3 = "gemma3" IDEFICS3 = "idefics3" + PIXTRAL = "pixtral" # Items here are (block size, type size) diff --git a/gguf-py/gguf/tensor_mapping.py 
b/gguf-py/gguf/tensor_mapping.py index 3ff378c136645..1d70551973b01 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -914,6 +914,7 @@ class TensorNameMap: "vision_tower.vision_model.embeddings.patch_embedding", "vpm.embeddings.patch_embedding", "model.vision_model.embeddings.patch_embedding", # SmolVLM + "vision_tower.patch_conv", # pixtral ), MODEL_TENSOR.V_ENC_EMBD_POS: ( @@ -926,52 +927,65 @@ class TensorNameMap: "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj", "vpm.encoder.layers.{bid}.self_attn.q_proj", "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral ), MODEL_TENSOR.V_ENC_ATTN_K: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj", "vpm.encoder.layers.{bid}.self_attn.k_proj", "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral ), MODEL_TENSOR.V_ENC_ATTN_V: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj", "vpm.encoder.layers.{bid}.self_attn.v_proj", "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral ), MODEL_TENSOR.V_ENC_INPUT_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1", "vpm.encoder.layers.{bid}.layer_norm1", "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral ), MODEL_TENSOR.V_ENC_OUTPUT: ( "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj", "vpm.encoder.layers.{bid}.self_attn.out_proj", "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM + "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral ), MODEL_TENSOR.V_ENC_OUTPUT_NORM: ( "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2", "vpm.encoder.layers.{bid}.layer_norm2", "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM + "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral ), MODEL_TENSOR.V_ENC_FFN_UP: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1", "vpm.encoder.layers.{bid}.mlp.fc1", "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped) + "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral + ), + + MODEL_TENSOR.V_ENC_FFN_GATE: ( + "vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral ), MODEL_TENSOR.V_ENC_FFN_DOWN: ( "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2", "vpm.encoder.layers.{bid}.mlp.fc2", "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped) + "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral ), MODEL_TENSOR.V_PRE_NORM: ( "vision_tower.vision_model.pre_layrnorm", + "vision_tower.ln_pre", # pixtral ), MODEL_TENSOR.V_POST_NORM: ( @@ -1030,6 +1044,10 @@ class TensorNameMap: MODEL_TENSOR.V_RESMPL_QUERY: ( "resampler.query", ), + + MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: ( + "v.token_embd.img_break", # for pixtral, this is a generated vector + ), } # architecture-specific block mappings diff --git a/include/llama.h b/include/llama.h index 5657fbf0a703a..a13350e15be6a 100644 --- a/include/llama.h +++ b/include/llama.h @@ -111,6 +111,7 @@ extern "C" { LLAMA_VOCAB_PRE_TYPE_TRILLION = 31, LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32, LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33, + LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34, }; enum llama_rope_type { diff --git 
a/models/ggml-vocab-pixtral.gguf.inp b/models/ggml-vocab-pixtral.gguf.inp new file mode 100644 index 0000000000000..9baf7d77ae6b5 --- /dev/null +++ b/models/ggml-vocab-pixtral.gguf.inp @@ -0,0 +1,112 @@ +ied 4 ½ months +__ggml_vocab_test__ +Führer +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + +__ggml_vocab_test__ + + +__ggml_vocab_test__ + + + +__ggml_vocab_test__ + + + + +__ggml_vocab_test__ + + +__ggml_vocab_test__ +Hello world +__ggml_vocab_test__ + Hello world +__ggml_vocab_test__ +Hello World +__ggml_vocab_test__ + Hello World +__ggml_vocab_test__ + Hello World! +__ggml_vocab_test__ +Hello, world! +__ggml_vocab_test__ + Hello, world! +__ggml_vocab_test__ + this is 🦙.cpp +__ggml_vocab_test__ +w048 7tuijk dsdfhu +__ggml_vocab_test__ +нещо на Български +__ggml_vocab_test__ +កាន់តែពិសេសអាចខលចេញ +__ggml_vocab_test__ +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token) +__ggml_vocab_test__ +Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello +__ggml_vocab_test__ + Hello + Hello +__ggml_vocab_test__ + ( +__ggml_vocab_test__ + + = +__ggml_vocab_test__ +' era +__ggml_vocab_test__ +Hello, y'all! How are you 😁 ?我想在apple工作1314151天~ +__ggml_vocab_test__ +!!!!!! +__ggml_vocab_test__ +3 +__ggml_vocab_test__ +33 +__ggml_vocab_test__ +333 +__ggml_vocab_test__ +3333 +__ggml_vocab_test__ +33333 +__ggml_vocab_test__ +333333 +__ggml_vocab_test__ +3333333 +__ggml_vocab_test__ +33333333 +__ggml_vocab_test__ +333333333 +__ggml_vocab_test__ +Cửa Việt +__ggml_vocab_test__ + discards +__ggml_vocab_test__ + + + + + + + + + + + +🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? 
We'Ve a'lL +__ggml_vocab_test__ diff --git a/models/ggml-vocab-pixtral.gguf.out b/models/ggml-vocab-pixtral.gguf.out new file mode 100644 index 0000000000000..53309d1bc9ac7 --- /dev/null +++ b/models/ggml-vocab-pixtral.gguf.out @@ -0,0 +1,46 @@ + 2014 1032 1052 1032 28504 6972 + 1070 7088 1258 + + 1032 + 1256 + 1293 + 1009 + 1010 + 1267 + 4688 + 1009 1010 + 22177 4304 + 45383 4304 + 22177 5325 + 45383 5325 + 45383 5325 1033 + 22177 1044 4304 1033 + 45383 1044 4304 1033 + 1593 1395 119685 1166 1153 1046 51228 + 1119 1048 1052 1056 1032 1055 17391 23216 30203 7785 17279 + 3337 30757 1902 4200 63073 3671 + 1225 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1225 1158 1129 1225 1158 1155 1225 1158 1133 1225 21359 1225 1158 1137 + 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 1319 11234 1873 26303 1455 1934 2246 3754 10835 1041 + 22177 + 45383 + 1032 45383 + 1256 45383 + 1293 45383 + 1293 45383 1010 1293 45383 + 1319 + 1010 1376 + 1039 4033 + 22177 1044 1404 48054 1033 3075 1584 1636 119685 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749 + 7290 7290 7290 + 1051 + 1051 1051 + 1051 1051 1051 + 1051 1051 1051 1051 + 1051 1051 1051 1051 1051 + 1051 1051 1051 1051 1051 1051 + 1051 1051 1051 1051 1051 1051 1051 + 1051 1051 1051 1051 1051 1051 1051 1051 + 1051 1051 1051 1051 1051 1051 1051 1051 1051 + 1067 59503 28783 + 3724 4058 + 1010 1032 1267 1032 4688 1032 17152 1458 29356 1010 1256 1010 1293 1010 1260 1010 1652 1010 1240 1159 1154 1128 1319 13052 1041 119685 1152 1182 29568 1240 1159 1140 1171 1239 1184 1143 1319 88181 1873 3659 1275 56421 1621 1041 126241 1133 119685 1166 1153 1240 1159 1166 1153 1032 1051 1032 1051 1051 1032 1051 1051 1051 1032 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1032 1051 1051 1051 1051 1051 1051 1051 1051 1032 1051 1046 1051 1032 1051 1791 1051 1032 1051 2880 1051 71881 1158 1128 1225 1158 1182 1225 1158 1147 1225 1159 1139 1225 1158 1143 1225 1159 1130 1225 1158 1150 1225 1158 1183 1225 1158 1159 1225 21359 1225 1158 1159 1225 1158 1162 1225 1158 1182 1225 1158 1133 1240 1159 1152 1129 3082 26060 2998 63614 82278 1049 1051 1049 1052 1049 1053 1049 6434 6749 45577 1045 6626 43555 2843 30757 1902 4200 63073 3671 14931 20040 20040 1657 1657 1975 14135 14135 83923 7290 7290 7290 45509 45509 45509 1362 6483 2151 1576 1116 2189 1514 1681 2156 1044 1576 3609 1636 5257 1063 1576 1077 1605 5257 1362 7534 3180 1494 1044 1576 1068 1636 2479 2269 26883 1063 2837 1039 45654 1261 54297 1076 diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 480605173dd91..50ded286f3f5f 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1506,7 +1506,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama3" || tokenizer_pre == "llama-v3" || tokenizer_pre == "llama-bpe"|| - tokenizer_pre == "falcon3") { + tokenizer_pre == "falcon3" || + tokenizer_pre == "pixtral") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; From 56304069599f4dd9749d94b9bca2c2c65bb27c02 Mon Sep 17 00:00:00 2001 From: pl752 Date: Thu, 24 Apr 2025 02:32:35 +0500 Subject: [PATCH 016/200] llama-mtmd-cli: Sigint rework in mtmd vision example (#13080) * Sigint rework in 
mtmd vision example * Applied suggestions on mtmd-cli PR * Forgot to invert one of the conditions * Update examples/llava/mtmd-cli.cpp * Removed redundant exit check --------- Co-authored-by: pl752 Co-authored-by: Xuan-Son Nguyen --- examples/llava/mtmd-cli.cpp | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index e80845a2c5469..89af7331a1658 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -24,7 +24,9 @@ #include #endif -static bool g_is_generating = false; +// volatile, because of signal being an interrupt +static volatile bool g_is_generating = false; +static volatile bool g_is_interrupted = false; /** * Please note that this is NOT a production-ready stuff. @@ -50,8 +52,10 @@ static void sigint_handler(int signo) { g_is_generating = false; } else { console::cleanup(); - LOG("\nInterrupted by user\n"); - _exit(130); + if (g_is_interrupted) { + _exit(1); + } + g_is_interrupted = true; } } } @@ -167,7 +171,7 @@ struct decode_embd_batch { static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) { llama_tokens generated_tokens; for (int i = 0; i < n_predict; i++) { - if (i > n_predict || !g_is_generating) { + if (i > n_predict || !g_is_generating || g_is_interrupted) { printf("\n"); break; } @@ -184,6 +188,11 @@ static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int printf("%s", common_token_to_piece(ctx.lctx, token_id).c_str()); fflush(stdout); + if (g_is_interrupted) { + printf("\n"); + break; + } + // eval the token common_batch_clear(ctx.batch); common_batch_add(ctx.batch, token_id, ctx.n_past++, {0}, true); @@ -219,6 +228,9 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect text.add_special = add_bos; text.parse_special = true; mtmd_input_chunks chunks; + + if (g_is_interrupted) return 0; + int32_t res = mtmd_tokenize(ctx.ctx_vision.get(), chunks, text, bitmaps); if (res != 0) { LOG_ERR("Unable to tokenize prompt, res = %d\n", res); @@ -276,6 +288,8 @@ int main(int argc, char ** argv) { #endif } + if (g_is_interrupted) return 130; + if (is_single_turn) { g_is_generating = true; if (params.prompt.find("<__image__>") == std::string::npos) { @@ -287,7 +301,7 @@ int main(int argc, char ** argv) { if (eval_message(ctx, msg, params.image, true)) { return 1; } - if (generate_response(ctx, smpl, n_predict)) { + if (!g_is_interrupted && generate_response(ctx, smpl, n_predict)) { return 1; } @@ -302,12 +316,13 @@ int main(int argc, char ** argv) { std::vector images_fname; std::string content; - while (true) { + while (!g_is_interrupted) { g_is_generating = false; LOG("\n> "); console::set_display(console::user_input); std::string line; console::readline(line, false); + if (g_is_interrupted) break; console::set_display(console::reset); line = string_strip(line); if (line.empty()) { @@ -335,6 +350,7 @@ int main(int argc, char ** argv) { msg.role = "user"; msg.content = content; int ret = eval_message(ctx, msg, images_fname, is_first_msg); + if (g_is_interrupted) break; if (ret == 2) { // non-fatal error images_fname.clear(); @@ -352,6 +368,7 @@ int main(int argc, char ** argv) { is_first_msg = false; } } + if (g_is_interrupted) LOG("\nInterrupted by user\n"); llama_perf_context_print(ctx.lctx); - return 0; + return g_is_interrupted ? 
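This rework turns SIGINT into a two-stage signal: the first Ctrl-C only clears g_is_generating so the current generation stops, the second sets g_is_interrupted so the REPL unwinds, and the flags are volatile because they are written from the signal handler. A minimal standalone sketch of the same pattern, using std::sig_atomic_t since that is the type the standard guarantees may be written from a handler (the patch itself uses volatile bool in the same role):

```cpp
// Minimal sketch of the two-stage Ctrl-C handling: the first SIGINT stops the
// current generation, the second requests shutdown of the REPL, and a further
// one while already shutting down exits immediately.
#include <csignal>
#include <cstdio>
#include <cstdlib>

static volatile std::sig_atomic_t g_generating  = 0;
static volatile std::sig_atomic_t g_interrupted = 0;

static void sigint_handler(int) {
    if (g_generating) {
        g_generating = 0;      // first press: abort the current generation only
    } else {
        if (g_interrupted) {
            std::_Exit(1);     // pressed again while shutting down: hard exit
        }
        g_interrupted = 1;     // second press: leave the interactive loop
    }
}

int main() {
    std::signal(SIGINT, sigint_handler);
    while (!g_interrupted) {
        g_generating = 1;
        while (g_generating && !g_interrupted) {
            // stand-in for producing one token per iteration
        }
        // back at the prompt; the next Ctrl-C sets g_interrupted
    }
    std::printf("\nInterrupted by user\n");
    return 130;
}
```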
130 : 0; } From b3b6d862cfdf190e1b9ad961639a25f5ebc0c7e3 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Thu, 24 Apr 2025 07:18:33 +0000 Subject: [PATCH 017/200] vulkan: matmul gcn tuning (#13016) * tune matmul for gcn * this one is more power efficient * Update ggml/src/ggml-vulkan/ggml-vulkan.cpp Co-authored-by: 0cc4m * disable this tune for the proprietary driver --------- Co-authored-by: 0cc4m --- ggml/src/ggml-vulkan/ggml-vulkan.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp index 39f3cd343ac45..c0bdb9e17a7b4 100644 --- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp @@ -246,6 +246,7 @@ struct vk_device_struct { bool pipeline_robustness; vk::Device device; uint32_t vendor_id; + vk::DriverId driver_id; vk_device_architecture architecture; vk_queue compute_queue; vk_queue transfer_queue; @@ -1740,6 +1741,11 @@ static void ggml_vk_load_shaders(vk_device& device) { m_warptile_mmq_int = { 128, 64, 64, 32, subgroup_size_8, 32, 2, 2, 2, 1, subgroup_size_8 }; s_warptile_mmq_int = { subgroup_size_32, 32, 32, 32, 32, 32, 2, 2, 1, 1, subgroup_size_8 }; + // chip specific tuning + if ((device->architecture == AMD_GCN) && (device->driver_id != vk::DriverId::eAmdProprietary)) { + m_warptile_mmq = m_warptile_mmq_int = { 256, 64, 64, 32, 16, 16, 2, 2, 2, 1, 16 }; + } + l_mmq_wg_denoms = l_wg_denoms = {128, 128, 1 }; m_mmq_wg_denoms = m_wg_denoms = { 64, 64, 1 }; s_mmq_wg_denoms = s_wg_denoms = { 32, 32, 1 }; @@ -2658,6 +2664,7 @@ static vk_device ggml_vk_get_device(size_t idx) { device->physical_device.getProperties2(&props2); device->properties = props2.properties; device->vendor_id = device->properties.vendorID; + device->driver_id = driver_props.driverID; const char* GGML_VK_FORCE_MAX_ALLOCATION_SIZE = getenv("GGML_VK_FORCE_MAX_ALLOCATION_SIZE"); From 7604a7d6b80e78eef8f275fc700d0f64820d672f Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 10:38:30 +0300 Subject: [PATCH 018/200] metal : fix floating-point range of attention scores in FA kernels (#13090) ggml-ci --- ggml/src/ggml-metal/ggml-metal.metal | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal index 8d6e99e621e9e..9f4147e93974d 100644 --- a/ggml/src/ggml-metal/ggml-metal.metal +++ b/ggml/src/ggml-metal/ggml-metal.metal @@ -3192,7 +3192,7 @@ kernel void kernel_flash_attn_ext( { float S[Q] = { [0 ... Q-1] = 0.0f }; - float M[Q] = { [0 ... Q-1] = -__FLT16_MAX__/2 }; + float M[Q] = { [0 ... 
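For context on the -__FLT_MAX__/2 change: M is the running maximum used by the streaming (online) softmax inside the flash-attention kernels, so its initial value has to act as an effectively minus-infinite score while staying finite. With scores held in fp32, half of the fp16 maximum is no longer guaranteed to sit below every representable score, hence the switch to half of the fp32 maximum; halving leaves headroom so that adding a finite value to the sentinel cannot overflow to -inf. A small sketch of the online-softmax recurrence that this sentinel feeds into:

```cpp
// Sketch of the streaming ("online") softmax: M tracks the largest score seen
// so far and S the sum of exp(score - M), rescaled whenever M grows. M starts
// at a finite stand-in for minus infinity.
#include <algorithm>
#include <cfloat>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> scores = { -1.5f, 2.0f, 0.25f, -60000.0f };

    float M = -FLT_MAX/2; // running maximum ("minus infinity" sentinel)
    float S = 0.0f;       // running sum of exp(score - M)

    for (float s : scores) {
        const float M_new = std::max(M, s);
        S = S*std::exp(M - M_new) + std::exp(s - M_new); // rescale the old sum
        M = M_new;
    }

    for (float s : scores) { // softmax(s_i) = exp(s_i - M)/S
        std::printf("%10.2f -> %.6f\n", s, std::exp(s - M)/S);
    }
}
```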
Q-1] = -__FLT_MAX__/2 }; // thread indices inside the simdgroup // TODO: see if we can utilize quad-group functions for better performance @@ -3452,7 +3452,7 @@ kernel void kernel_flash_attn_ext( // reduce the warps sequentially for (ushort sg = 1; sg < nsg; ++sg) { float S = { 0.0f }; - float M = { -__FLT16_MAX__/2 }; + float M = { -__FLT_MAX__/2 }; threadgroup_barrier(mem_flags::mem_threadgroup); @@ -3699,7 +3699,7 @@ kernel void kernel_flash_attn_ext_vec( { float S = 0.0f; - float M = -__FLT16_MAX__/2; + float M = -__FLT_MAX__/2; // thread indices inside the simdgroup const short tx = tiisg%NL; From 80982e815e67bae2442237f4e11466f44c9a2988 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 24 Apr 2025 12:14:13 +0200 Subject: [PATCH 019/200] arg : clean up handling --mmproj with -hf (#13082) * arg : clean up handling --mmproj with -hf * rm change about no_mmproj * Revert "rm change about no_mmproj" This reverts commit 2cac8e0efb629d66c612f137e75d562f94bb9e6c. * handle no_mmproj explicitly * skip download mmproj on examples not using it --- common/arg.cpp | 68 +++++++++++++++++++++++++++---------- common/common.h | 1 + examples/llava/mtmd-cli.cpp | 1 + 3 files changed, 53 insertions(+), 17 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 1cfd0168d95ae..85ba411146786 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -38,6 +38,11 @@ using json = nlohmann::ordered_json; +std::initializer_list mmproj_examples = { + LLAMA_EXAMPLE_LLAVA, + // TODO: add LLAMA_EXAMPLE_SERVER when it's ready +}; + common_arg & common_arg::set_examples(std::initializer_list examples) { this->examples = std::move(examples); return *this; @@ -641,11 +646,16 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s // utils // -static void common_params_handle_model( +struct handle_model_result { + bool found_mmproj = false; + common_params_model mmproj; +}; + +static handle_model_result common_params_handle_model( struct common_params_model & model, const std::string & bearer_token, - const std::string & model_path_default, - bool is_mmproj = false) { // TODO: move is_mmproj to an enum when we have more files? + const std::string & model_path_default) { + handle_model_result result; // handle pre-fill default model path and url based on hf_repo and hf_file { if (!model.hf_repo.empty()) { @@ -657,7 +667,12 @@ static void common_params_handle_model( exit(1); // built without CURL, error message already printed } model.hf_repo = auto_detected.repo; - model.hf_file = is_mmproj ? 
auto_detected.mmprojFile : auto_detected.ggufFile; + model.hf_file = auto_detected.ggufFile; + if (!auto_detected.mmprojFile.empty()) { + result.found_mmproj = true; + result.mmproj.hf_repo = model.hf_repo; + result.mmproj.hf_file = auto_detected.mmprojFile; + } } else { model.hf_file = model.path; } @@ -694,6 +709,8 @@ static void common_params_handle_model( exit(1); } } + + return result; } const std::vector kv_cache_types = { @@ -827,16 +844,25 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context throw std::invalid_argument("error: --prompt-cache-all not supported in interactive mode yet\n"); } - common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); - common_params_handle_model(params.speculative.model, params.hf_token, ""); - common_params_handle_model(params.vocoder.model, params.hf_token, ""); - - // allow --mmproj to be set from -hf - // assuming that mmproj is always in the same repo as text model - if (!params.model.hf_repo.empty() && ctx_arg.ex == LLAMA_EXAMPLE_LLAVA) { - params.mmproj.hf_repo = params.model.hf_repo; + // handle model and download + { + auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH); + if (params.no_mmproj) { + params.mmproj = {}; + } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) { + // optionally, handle mmproj model when -hf is specified + params.mmproj = res.mmproj; + } + // only download mmproj if the current example is using it + for (auto & ex : mmproj_examples) { + if (ctx_arg.ex == ex) { + common_params_handle_model(params.mmproj, params.hf_token, ""); + break; + } + } + common_params_handle_model(params.speculative.model, params.hf_token, ""); + common_params_handle_model(params.vocoder.model, params.hf_token, ""); } - common_params_handle_model(params.mmproj, params.hf_token, "", true); if (params.escape) { string_process_escapes(params.prompt); @@ -2095,18 +2121,25 @@ common_params_context common_params_parser_init(common_params & params, llama_ex ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING")); add_opt(common_arg( {"--mmproj"}, "FILE", - "path to a multimodal projector file for LLaVA. see examples/llava/README.md", + "path to a multimodal projector file. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj.path = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--mmproj-url"}, "URL", - "URL to a multimodal projector file for LLaVA. see examples/llava/README.md", + "URL to a multimodal projector file. see examples/llava/README.md", [](common_params & params, const std::string & value) { params.mmproj.url = value; } - ).set_examples({LLAMA_EXAMPLE_LLAVA})); + ).set_examples(mmproj_examples)); + add_opt(common_arg( + {"--no-mmproj"}, + "explicitly disable multimodal projector, useful when using -hf", + [](common_params & params) { + params.no_mmproj = true; + } + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching", @@ -2381,6 +2414,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex add_opt(common_arg( {"-hf", "-hfr", "--hf-repo"}, "/[:quant]", "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n" + "mmproj is also downloaded automatically if available. to disable, add --no-mmproj\n" "example: unsloth/phi-4-GGUF:q4_k_m\n" "(default: unused)", [](common_params & params, const std::string & value) { diff --git a/common/common.h b/common/common.h index e6eaa8e80cf05..70d3ef8f27870 100644 --- a/common/common.h +++ b/common/common.h @@ -342,6 +342,7 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; + bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) // embedding diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 89af7331a1658..19373760576a9 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -261,6 +261,7 @@ int main(int argc, char ** argv) { if (params.mmproj.path.empty()) { show_additional_info(argc, argv); + LOG_ERR("ERR: Missing --mmproj argument\n"); return 1; } From 7c727fbe39150fbe8381f4fa43fed08719ebebe6 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 24 Apr 2025 14:04:14 +0200 Subject: [PATCH 020/200] arg : add --no-mmproj-offload (#13093) * arg : add --no-mmproj-offload * Update common/arg.cpp --- common/arg.cpp | 7 +++++++ common/common.h | 1 + examples/llava/mtmd-cli.cpp | 7 ++++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 85ba411146786..9cbf985710112 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2140,6 +2140,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.no_mmproj = true; } ).set_examples(mmproj_examples)); + add_opt(common_arg( + {"--no-mmproj-offload"}, + "do not offload multimodal projector to GPU", + [](common_params & params) { + params.mmproj_use_gpu = false; + } + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. 
Specify multiple times for batching", diff --git a/common/common.h b/common/common.h index 70d3ef8f27870..0a9dc0599f722 100644 --- a/common/common.h +++ b/common/common.h @@ -342,6 +342,7 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; + bool mmproj_use_gpu = true; // use GPU for multimodal model bool no_mmproj = false; // explicitly disable multimodal model std::vector image; // path to image file(s) diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 19373760576a9..250e8c9a9e871 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -40,7 +40,8 @@ static void show_additional_info(int /*argc*/, char ** argv) { "Usage: %s [options] -m --mmproj --image -p \n\n" " -m and --mmproj are required\n" " -hf user/repo can replace both -m and --mmproj in most cases\n" - " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n", + " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n" + " to disable using GPU for mmproj model, add --no-mmproj-offload\n", argv[0] ); } @@ -112,10 +113,10 @@ struct mtmd_cli_context { void init_vision_context(common_params & params) { const char * clip_path = params.mmproj.path.c_str(); ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{ - /* use_gpu */ true, + /* use_gpu */ params.mmproj_use_gpu, /* timings */ true, /* n_threads */ params.cpuparams.n_threads, - /* verbosity */ GGML_LOG_LEVEL_INFO, + /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO, })); if (!ctx_vision.get()) { LOG_ERR("Failed to load vision model from %s\n", clip_path); From 572b3141d343d7f947bf53b57513016e90db5680 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 15:44:05 +0300 Subject: [PATCH 021/200] clang-tidy : disable warning about missing math parenthesis (#13091) --- .clang-tidy | 1 + 1 file changed, 1 insertion(+) diff --git a/.clang-tidy b/.clang-tidy index 310c3d182c8f2..5bc63bc6e27b6 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -13,6 +13,7 @@ Checks: > -readability-magic-numbers, -readability-uppercase-literal-suffix, -readability-simplify-boolean-expr, + -readability-math-missing-parentheses, clang-analyzer-*, -clang-analyzer-security.insecureAPI.DeprecatedOrUnsafeBufferHandling, performance-*, From 13b4548877326fdabee3e831b8cfd65d9844383c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 16:00:10 +0300 Subject: [PATCH 022/200] cmake : do not include ./src as public for libllama (#13062) * cmake : do not include ./src as public for libllama ggml-ci * cmake : rework tests ggml-ci * llguidance : remove unicode include ggml-ci * cmake : make c++17 private ggml-ci --- common/arg.cpp | 2 - examples/CMakeLists.txt | 9 --- examples/gbnf-validator/CMakeLists.txt | 5 -- examples/quantize-stats/CMakeLists.txt | 6 -- grammars/README.md | 2 +- src/CMakeLists.txt | 5 +- tests/CMakeLists.txt | 69 +++++++++++-------- tests/test-chat.cpp | 5 +- .../test-gbnf-validator.cpp | 4 +- tests/test-grammar-integration.cpp | 5 +- tests/test-grammar-llguidance.cpp | 3 +- tests/test-grammar-parser.cpp | 4 +- tests/test-json-schema-to-grammar.cpp | 2 +- tests/test-llama-grammar.cpp | 3 +- .../test-quantize-stats.cpp | 3 +- tests/test-tokenizer-1-bpe.cpp | 3 +- tests/test-tokenizer-1-spm.cpp | 3 +- 17 files changed, 64 insertions(+), 69 deletions(-) delete mode 100644 examples/gbnf-validator/CMakeLists.txt delete mode 100644 examples/quantize-stats/CMakeLists.txt rename 
examples/gbnf-validator/gbnf-validator.cpp => tests/test-gbnf-validator.cpp (98%) rename examples/quantize-stats/quantize-stats.cpp => tests/test-quantize-stats.cpp (99%) diff --git a/common/arg.cpp b/common/arg.cpp index 9cbf985710112..0657553e4e9cf 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -994,7 +994,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-embedding", "llama-eval-callback", "llama-export-lora", - "llama-gbnf-validator", "llama-gen-docs", "llama-gguf", "llama-gguf-hash", @@ -1014,7 +1013,6 @@ static void common_params_print_completion(common_params_context & ctx_arg) { "llama-perplexity", "llama-q8dot", "llama-quantize", - "llama-quantize-stats", "llama-qwen2vl-cli", "llama-retrieval", "llama-run", diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 66cfab2c3b796..37476f9043e78 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,11 +21,6 @@ else() add_subdirectory(embedding) add_subdirectory(eval-callback) - if (NOT WIN32) - # disabled on Windows because it uses internal functions not exported with LLAMA_API - add_subdirectory(gbnf-validator) - endif() - add_subdirectory(gguf-hash) add_subdirectory(gguf-split) add_subdirectory(gguf) @@ -58,10 +53,6 @@ else() add_subdirectory(convert-llama2c-to-ggml) add_subdirectory(cvector-generator) add_subdirectory(export-lora) - if (NOT WIN32) - # disabled on Windows because it uses internal functions not exported with LLAMA_API - add_subdirectory(quantize-stats) - endif() add_subdirectory(llava) if (GGML_RPC) add_subdirectory(rpc) diff --git a/examples/gbnf-validator/CMakeLists.txt b/examples/gbnf-validator/CMakeLists.txt deleted file mode 100644 index d2cb524c0a7f7..0000000000000 --- a/examples/gbnf-validator/CMakeLists.txt +++ /dev/null @@ -1,5 +0,0 @@ -set(TARGET llama-gbnf-validator) -add_executable(${TARGET} gbnf-validator.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/examples/quantize-stats/CMakeLists.txt b/examples/quantize-stats/CMakeLists.txt deleted file mode 100644 index 9a3a0d3cd2dee..0000000000000 --- a/examples/quantize-stats/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -set(TARGET llama-quantize-stats) -add_executable(${TARGET} quantize-stats.cpp) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT}) -target_include_directories(${TARGET} PRIVATE ../../common) -target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/grammars/README.md b/grammars/README.md index 935213f5c1849..5aa12acc1bff3 100644 --- a/grammars/README.md +++ b/grammars/README.md @@ -112,7 +112,7 @@ You can use GBNF grammars: - In [llama-server](../examples/server)'s completion endpoints, passed as the `grammar` body field - In [llama-cli](../examples/main), passed as the `--grammar` & `--grammar-file` flags -- With [llama-gbnf-validator](../examples/gbnf-validator) tool, to test them against strings. +- With [test-gbnf-validator](../tests/test-gbnf-validator.cpp), to test them against strings. ## JSON Schemas → GBNF diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9f7ab13f1e620..1cd316b03e132 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -32,8 +32,9 @@ add_library(llama unicode.h ) -target_include_directories(llama PUBLIC . 
../include) -target_compile_features (llama PUBLIC cxx_std_17) # don't bump +target_include_directories(llama PRIVATE .) +target_include_directories(llama PUBLIC ../include) +target_compile_features (llama PRIVATE cxx_std_17) # don't bump target_link_libraries(llama PUBLIC ggml) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2bb210702aef8..ae68275251d01 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -1,5 +1,17 @@ llama_add_compile_flags() +function(llama_build source) + if (DEFINED LLAMA_TEST_NAME) + set(TEST_TARGET ${LLAMA_TEST_NAME}) + else() + get_filename_component(TEST_TARGET ${source} NAME_WE) + endif() + + add_executable(${TEST_TARGET} ${source}) + target_link_libraries(${TEST_TARGET} PRIVATE common) + install(TARGETS ${TEST_TARGET} RUNTIME) +endfunction() + function(llama_test target) include(CMakeParseArguments) set(options) @@ -36,7 +48,7 @@ endfunction() # - LABEL: label for the test (defaults to main) # - ARGS: arguments to pass to the test executable # - WORKING_DIRECTORY -function(llama_target_and_test source) +function(llama_build_and_test source) include(CMakeParseArguments) set(options) set(oneValueArgs NAME LABEL WORKING_DIRECTORY) @@ -58,6 +70,7 @@ function(llama_target_and_test source) add_executable(${TEST_TARGET} ${source} get-model.cpp) install(TARGETS ${TEST_TARGET} RUNTIME) target_link_libraries(${TEST_TARGET} PRIVATE common) + add_test( NAME ${TEST_TARGET} WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} @@ -68,9 +81,7 @@ function(llama_target_and_test source) endfunction() # build test-tokenizer-0 target once and add many tests -add_executable(test-tokenizer-0 test-tokenizer-0.cpp) -target_link_libraries(test-tokenizer-0 PRIVATE common) -install(TARGETS test-tokenizer-0 RUNTIME) +llama_build(test-tokenizer-0.cpp) llama_test(test-tokenizer-0 NAME test-tokenizer-0-bert-bge ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bert-bge.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf) @@ -87,27 +98,27 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) if (LLAMA_LLGUIDANCE) - llama_target_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) + llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) endif () if (NOT WIN32) # these tests are disabled on Windows because they use internal functions not exported with LLAMA_API - llama_target_and_test(test-sampling.cpp) - llama_target_and_test(test-grammar-parser.cpp) - llama_target_and_test(test-grammar-integration.cpp) - llama_target_and_test(test-llama-grammar.cpp) - llama_target_and_test(test-chat.cpp) + llama_build_and_test(test-sampling.cpp) + llama_build_and_test(test-grammar-parser.cpp) + llama_build_and_test(test-grammar-integration.cpp) + llama_build_and_test(test-llama-grammar.cpp) + llama_build_and_test(test-chat.cpp) # TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8 if (NOT ${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64") - llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) + llama_build_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..) 
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server) endif() + llama_build(test-quantize-stats.cpp) + llama_build(test-gbnf-validator.cpp) # build test-tokenizer-1-bpe target once and add many tests - add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp) - target_link_libraries(test-tokenizer-1-bpe PRIVATE common) - install(TARGETS test-tokenizer-1-bpe RUNTIME) + llama_build(test-tokenizer-1-bpe.cpp) # TODO: disabled due to slowness #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) @@ -120,37 +131,35 @@ if (NOT WIN32) #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) # build test-tokenizer-1-spm target once and add many tests - add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp) - target_link_libraries(test-tokenizer-1-spm PRIVATE common) - install(TARGETS test-tokenizer-1-spm RUNTIME) + llama_build(test-tokenizer-1-spm.cpp) llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-llama-spm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf) #llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) - # llama_target_and_test(test-double-float.cpp) # SLOW + # llama_build_and_test(test-double-float.cpp) # SLOW endif() -llama_target_and_test(test-log.cpp) -llama_target_and_test(test-chat-template.cpp) +llama_build_and_test(test-log.cpp) +llama_build_and_test(test-chat-template.cpp) # this fails on windows (github hosted runner) due to curl DLL not found (exit code 0xc0000135) if (NOT WIN32) - llama_target_and_test(test-arg-parser.cpp) + llama_build_and_test(test-arg-parser.cpp) endif() -# llama_target_and_test(test-opt.cpp) # SLOW -llama_target_and_test(test-gguf.cpp) -llama_target_and_test(test-backend-ops.cpp) +# llama_build_and_test(test-opt.cpp) # SLOW +llama_build_and_test(test-gguf.cpp) +llama_build_and_test(test-backend-ops.cpp) -llama_target_and_test(test-model-load-cancel.cpp LABEL "model") -llama_target_and_test(test-autorelease.cpp LABEL "model") +llama_build_and_test(test-model-load-cancel.cpp LABEL "model") +llama_build_and_test(test-autorelease.cpp LABEL "model") if (NOT GGML_BACKEND_DL) # these tests use the backends directly and cannot be built with dynamic loading - llama_target_and_test(test-barrier.cpp) - llama_target_and_test(test-quantize-fns.cpp) - llama_target_and_test(test-quantize-perf.cpp) - llama_target_and_test(test-rope.cpp) + llama_build_and_test(test-barrier.cpp) + llama_build_and_test(test-quantize-fns.cpp) + llama_build_and_test(test-quantize-perf.cpp) + llama_build_and_test(test-rope.cpp) endif() diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp index a0bf6affe5220..fa7aed82dfaa8 100644 --- a/tests/test-chat.cpp +++ b/tests/test-chat.cpp @@ -11,8 +11,9 @@ #include #include "chat.h" -#include "llama-grammar.h" -#include "unicode.h" + +#include "../src/unicode.h" +#include "../src/llama-grammar.h" using json = nlohmann::ordered_json; diff --git a/examples/gbnf-validator/gbnf-validator.cpp b/tests/test-gbnf-validator.cpp similarity index 98% rename from examples/gbnf-validator/gbnf-validator.cpp rename to tests/test-gbnf-validator.cpp index a610e6a0b19d7..6547eec32fab4 100644 --- a/examples/gbnf-validator/gbnf-validator.cpp +++ b/tests/test-gbnf-validator.cpp @@ -1,5 +1,5 @@ -#include "unicode.h" -#include "llama-grammar.h" +#include 
"../src/unicode.h" +#include "../src/llama-grammar.h" #include #include diff --git a/tests/test-grammar-integration.cpp b/tests/test-grammar-integration.cpp index 89060864894a4..8988c347e3e32 100644 --- a/tests/test-grammar-integration.cpp +++ b/tests/test-grammar-integration.cpp @@ -2,10 +2,11 @@ #undef NDEBUG #endif -#include "unicode.h" -#include "llama-grammar.h" #include "json-schema-to-grammar.h" +#include "../src/unicode.h" +#include "../src/llama-grammar.h" + #include #include #include diff --git a/tests/test-grammar-llguidance.cpp b/tests/test-grammar-llguidance.cpp index 3c19220e11964..566b039a07038 100644 --- a/tests/test-grammar-llguidance.cpp +++ b/tests/test-grammar-llguidance.cpp @@ -2,7 +2,6 @@ # undef NDEBUG #endif -#include "unicode.h" #include "sampling.h" #include @@ -84,7 +83,7 @@ static void test(const std::string & test_desc, const std::string & grammar_str, fprintf(stderr, "\n NOTE: Debug grammar file generated. To analyze this failure in detail, run the following " - "command: ./llama-gbnf-validator test-grammar-integration.grammar.gbnf " + "command: ./test-gbnf-validator test-grammar-integration.grammar.gbnf " "test-grammar-integration.string.txt\n\n"); } else { fprintf(stdout, "✅︎\n"); diff --git a/tests/test-grammar-parser.cpp b/tests/test-grammar-parser.cpp index 259172d999c78..67821a2d5c609 100644 --- a/tests/test-grammar-parser.cpp +++ b/tests/test-grammar-parser.cpp @@ -3,7 +3,9 @@ #endif #include "llama.h" -#include "llama-grammar.h" + +// TODO: shold not include libllama sources +#include "../src/llama-grammar.h" #include diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index 4d78e914269f3..e35134f3cb063 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -4,7 +4,7 @@ #include "json-schema-to-grammar.h" -#include "llama-grammar.h" +#include "../src/llama-grammar.h" #include #include diff --git a/tests/test-llama-grammar.cpp b/tests/test-llama-grammar.cpp index e2129206be156..cc198f3e3c903 100644 --- a/tests/test-llama-grammar.cpp +++ b/tests/test-llama-grammar.cpp @@ -3,7 +3,8 @@ #endif #include "llama.h" -#include "llama-grammar.h" + +#include "../src/llama-grammar.h" #include #include diff --git a/examples/quantize-stats/quantize-stats.cpp b/tests/test-quantize-stats.cpp similarity index 99% rename from examples/quantize-stats/quantize-stats.cpp rename to tests/test-quantize-stats.cpp index dd07ab9b37456..db01059119e9b 100644 --- a/examples/quantize-stats/quantize-stats.cpp +++ b/tests/test-quantize-stats.cpp @@ -1,8 +1,9 @@ #include "ggml.h" #include "llama.h" -#include "llama-model.h" #include "common.h" +#include "../src/llama-model.h" + #include #include #include diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 55425d88a7e07..b183da47f3cc8 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -1,8 +1,9 @@ #include "llama.h" #include "common.h" -#include "unicode.h" #include "console.h" +#include "../src/unicode.h" + #include #include #include diff --git a/tests/test-tokenizer-1-spm.cpp b/tests/test-tokenizer-1-spm.cpp index 9e7b77f31ea12..ba6e94ba8ea57 100644 --- a/tests/test-tokenizer-1-spm.cpp +++ b/tests/test-tokenizer-1-spm.cpp @@ -1,8 +1,9 @@ #include "llama.h" #include "common.h" -#include "unicode.h" #include "console.h" +#include "../src/unicode.h" + #include #include #include From b10d8bfdb1dac40cce34e8860ca5ec7d950c3a44 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= 
Date: Thu, 24 Apr 2025 15:57:10 +0200 Subject: [PATCH 023/200] CUDA: use switch statements in constexpr functions (#13095) --- ggml/src/ggml-cuda/mmq.cuh | 80 ++++++++++++++++++++------------------ ggml/src/ggml-cuda/mmvq.cu | 80 ++++++++++++++++++++------------------ 2 files changed, 84 insertions(+), 76 deletions(-) diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh index 532358018f410..3cb2015520ba1 100644 --- a/ggml/src/ggml-cuda/mmq.cuh +++ b/ggml/src/ggml-cuda/mmq.cuh @@ -155,25 +155,27 @@ static constexpr __device__ int get_mmq_y_device() { #define MMQ_DP4A_TXS_Q6_K tile_x_sizes{mmq_y*WARP_SIZE*2 + mmq_y, mmq_y*WARP_SIZE/QI6_K + mmq_y/QI6_K, mmq_y*WARP_SIZE/8 + mmq_y/8} static constexpr __host__ __device__ tile_x_sizes mmq_get_dp4a_tile_x_sizes(ggml_type type, int mmq_y) { - return type == GGML_TYPE_Q4_0 ? MMQ_DP4A_TXS_Q4_0 : - type == GGML_TYPE_Q4_1 ? MMQ_DP4A_TXS_Q4_1 : - type == GGML_TYPE_Q5_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_DP4A_TXS_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_DP4A_TXS_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_DP4A_TXS_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_DP4A_TXS_Q4_K : - type == GGML_TYPE_Q5_K ? MMQ_DP4A_TXS_Q5_K : - type == GGML_TYPE_Q6_K ? MMQ_DP4A_TXS_Q6_K : - type == GGML_TYPE_IQ2_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ2_S ? MMQ_DP4A_TXS_Q8_0_16 : - type == GGML_TYPE_IQ3_XXS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_DP4A_TXS_Q8_0 : - type == GGML_TYPE_IQ4_NL ? MMQ_DP4A_TXS_Q8_0 : - tile_x_sizes{0, 0, 0}; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_DP4A_TXS_Q4_0; + case GGML_TYPE_Q4_1: return MMQ_DP4A_TXS_Q4_1; + case GGML_TYPE_Q5_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_DP4A_TXS_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_DP4A_TXS_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_DP4A_TXS_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_DP4A_TXS_Q4_K; + case GGML_TYPE_Q5_K: return MMQ_DP4A_TXS_Q5_K; + case GGML_TYPE_Q6_K: return MMQ_DP4A_TXS_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ2_S: return MMQ_DP4A_TXS_Q8_0_16; + case GGML_TYPE_IQ3_XXS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_DP4A_TXS_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_DP4A_TXS_Q8_0; + default: return tile_x_sizes{0, 0, 0}; + } } #define MMQ_MMA_TILE_X_K_Q8_0 (2*WARP_SIZE + 2*WARP_SIZE/QI8_0 + 4) @@ -189,25 +191,27 @@ static_assert(MMQ_MMA_TILE_X_K_Q3_K % 8 == 4, "Wrong padding."); static_assert(MMQ_MMA_TILE_X_K_Q6_K % 8 == 4, "Wrong padding."); static constexpr __host__ __device__ int mmq_get_mma_tile_x_k(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q4_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q5_1 ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q8_0 ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_Q2_K ? MMQ_MMA_TILE_X_K_Q2_K : - type == GGML_TYPE_Q3_K ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_Q4_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q5_K ? MMQ_MMA_TILE_X_K_Q8_1 : - type == GGML_TYPE_Q6_K ? MMQ_MMA_TILE_X_K_Q6_K : - type == GGML_TYPE_IQ2_XXS ? 
MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ2_XS ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ2_S ? MMQ_MMA_TILE_X_K_Q3_K : - type == GGML_TYPE_IQ3_XXS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ3_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ1_S ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_XS ? MMQ_MMA_TILE_X_K_Q8_0 : - type == GGML_TYPE_IQ4_NL ? MMQ_MMA_TILE_X_K_Q8_0 : - 0; + switch (type) { + case GGML_TYPE_Q4_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q4_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q5_1: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q8_0: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_Q2_K: return MMQ_MMA_TILE_X_K_Q2_K; + case GGML_TYPE_Q3_K: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_Q4_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q5_K: return MMQ_MMA_TILE_X_K_Q8_1; + case GGML_TYPE_Q6_K: return MMQ_MMA_TILE_X_K_Q6_K; + case GGML_TYPE_IQ2_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ2_XS: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ2_S: return MMQ_MMA_TILE_X_K_Q3_K; + case GGML_TYPE_IQ3_XXS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ3_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ1_S: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_XS: return MMQ_MMA_TILE_X_K_Q8_0; + case GGML_TYPE_IQ4_NL: return MMQ_MMA_TILE_X_K_Q8_0; + default: return 0; + } } #define MMQ_TILE_Y_K (WARP_SIZE + WARP_SIZE/QI8_1) diff --git a/ggml/src/ggml-cuda/mmvq.cu b/ggml/src/ggml-cuda/mmvq.cu index cac04916cd8f0..d846e35a6a26d 100644 --- a/ggml/src/ggml-cuda/mmvq.cu +++ b/ggml/src/ggml-cuda/mmvq.cu @@ -7,47 +7,51 @@ typedef float (*vec_dot_q_cuda_t)(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & kbx, const int & iqs); static constexpr __device__ vec_dot_q_cuda_t get_vec_dot_q_cuda(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? vec_dot_q4_0_q8_1 : - type == GGML_TYPE_Q4_1 ? vec_dot_q4_1_q8_1 : - type == GGML_TYPE_Q5_0 ? vec_dot_q5_0_q8_1 : - type == GGML_TYPE_Q5_1 ? vec_dot_q5_1_q8_1 : - type == GGML_TYPE_Q8_0 ? vec_dot_q8_0_q8_1 : - type == GGML_TYPE_Q2_K ? vec_dot_q2_K_q8_1 : - type == GGML_TYPE_Q3_K ? vec_dot_q3_K_q8_1 : - type == GGML_TYPE_Q4_K ? vec_dot_q4_K_q8_1 : - type == GGML_TYPE_Q5_K ? vec_dot_q5_K_q8_1 : - type == GGML_TYPE_Q6_K ? vec_dot_q6_K_q8_1 : - type == GGML_TYPE_IQ2_XXS ? vec_dot_iq2_xxs_q8_1 : - type == GGML_TYPE_IQ2_XS ? vec_dot_iq2_xs_q8_1 : - type == GGML_TYPE_IQ2_S ? vec_dot_iq2_s_q8_1 : - type == GGML_TYPE_IQ3_XXS ? vec_dot_iq3_xxs_q8_1 : - type == GGML_TYPE_IQ1_S ? vec_dot_iq1_s_q8_1 : - type == GGML_TYPE_IQ1_M ? vec_dot_iq1_m_q8_1 : - type == GGML_TYPE_IQ4_NL ? vec_dot_iq4_nl_q8_1 : - type == GGML_TYPE_IQ4_XS ? vec_dot_iq4_xs_q8_1 : - type == GGML_TYPE_IQ3_S ? 
vec_dot_iq3_s_q8_1 : - nullptr; + switch (type) { + case GGML_TYPE_Q4_0: return vec_dot_q4_0_q8_1; + case GGML_TYPE_Q4_1: return vec_dot_q4_1_q8_1; + case GGML_TYPE_Q5_0: return vec_dot_q5_0_q8_1; + case GGML_TYPE_Q5_1: return vec_dot_q5_1_q8_1; + case GGML_TYPE_Q8_0: return vec_dot_q8_0_q8_1; + case GGML_TYPE_Q2_K: return vec_dot_q2_K_q8_1; + case GGML_TYPE_Q3_K: return vec_dot_q3_K_q8_1; + case GGML_TYPE_Q4_K: return vec_dot_q4_K_q8_1; + case GGML_TYPE_Q5_K: return vec_dot_q5_K_q8_1; + case GGML_TYPE_Q6_K: return vec_dot_q6_K_q8_1; + case GGML_TYPE_IQ2_XXS: return vec_dot_iq2_xxs_q8_1; + case GGML_TYPE_IQ2_XS: return vec_dot_iq2_xs_q8_1; + case GGML_TYPE_IQ2_S: return vec_dot_iq2_s_q8_1; + case GGML_TYPE_IQ3_XXS: return vec_dot_iq3_xxs_q8_1; + case GGML_TYPE_IQ1_S: return vec_dot_iq1_s_q8_1; + case GGML_TYPE_IQ1_M: return vec_dot_iq1_m_q8_1; + case GGML_TYPE_IQ4_NL: return vec_dot_iq4_nl_q8_1; + case GGML_TYPE_IQ4_XS: return vec_dot_iq4_xs_q8_1; + case GGML_TYPE_IQ3_S: return vec_dot_iq3_s_q8_1; + default: return nullptr; + } } static constexpr __device__ int get_vdr_mmvq(ggml_type type) { - return type == GGML_TYPE_Q4_0 ? VDR_Q4_0_Q8_1_MMVQ : - type == GGML_TYPE_Q4_1 ? VDR_Q4_1_Q8_1_MMVQ : - type == GGML_TYPE_Q5_0 ? VDR_Q5_0_Q8_1_MMVQ : - type == GGML_TYPE_Q5_1 ? VDR_Q5_1_Q8_1_MMVQ : - type == GGML_TYPE_Q8_0 ? VDR_Q8_0_Q8_1_MMVQ : - type == GGML_TYPE_Q2_K ? VDR_Q2_K_Q8_1_MMVQ : - type == GGML_TYPE_Q3_K ? VDR_Q3_K_Q8_1_MMVQ : - type == GGML_TYPE_Q4_K ? VDR_Q4_K_Q8_1_MMVQ : - type == GGML_TYPE_Q5_K ? VDR_Q5_K_Q8_1_MMVQ : - type == GGML_TYPE_Q6_K ? VDR_Q6_K_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XXS ? VDR_IQ2_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_XS ? VDR_IQ2_XS_Q8_1_MMVQ : - type == GGML_TYPE_IQ2_S ? VDR_IQ2_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_XXS ? VDR_IQ3_XXS_Q8_1_MMVQ : - type == GGML_TYPE_IQ3_S ? VDR_IQ3_S_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_NL ? VDR_IQ4_NL_Q8_1_MMVQ : - type == GGML_TYPE_IQ4_XS ? 
VDR_IQ4_XS_Q8_1_MMVQ : - 1; + switch (type) { + case GGML_TYPE_Q4_0: return VDR_Q4_0_Q8_1_MMVQ; + case GGML_TYPE_Q4_1: return VDR_Q4_1_Q8_1_MMVQ; + case GGML_TYPE_Q5_0: return VDR_Q5_0_Q8_1_MMVQ; + case GGML_TYPE_Q5_1: return VDR_Q5_1_Q8_1_MMVQ; + case GGML_TYPE_Q8_0: return VDR_Q8_0_Q8_1_MMVQ; + case GGML_TYPE_Q2_K: return VDR_Q2_K_Q8_1_MMVQ; + case GGML_TYPE_Q3_K: return VDR_Q3_K_Q8_1_MMVQ; + case GGML_TYPE_Q4_K: return VDR_Q4_K_Q8_1_MMVQ; + case GGML_TYPE_Q5_K: return VDR_Q5_K_Q8_1_MMVQ; + case GGML_TYPE_Q6_K: return VDR_Q6_K_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XXS: return VDR_IQ2_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_XS: return VDR_IQ2_XS_Q8_1_MMVQ; + case GGML_TYPE_IQ2_S: return VDR_IQ2_S_Q8_1_MMVQ; + case GGML_TYPE_IQ3_XXS: return VDR_IQ3_XXS_Q8_1_MMVQ; + case GGML_TYPE_IQ3_S: return VDR_IQ3_S_Q8_1_MMVQ; + case GGML_TYPE_IQ4_NL: return VDR_IQ4_NL_Q8_1_MMVQ; + case GGML_TYPE_IQ4_XS: return VDR_IQ4_XS_Q8_1_MMVQ; + default: return 1; + } } enum mmvq_parameter_table_id { From c6e8cc28c15166dba15629dba6a7366d4d5955ca Mon Sep 17 00:00:00 2001 From: Acly Date: Thu, 17 Apr 2025 14:16:45 +0200 Subject: [PATCH 024/200] ggml : Depthwise 2D convolution (ggml/1152) * ggml-cpu : kernels for faster depthwise 2D convolution * fix compile: remove static after moving to ops.cpp * add dilation for depthwise_conv_2d * review: rename to ggml_conv_2d_dw_direct, remove redundant struct keywords, pass by ref, whitespace * review: rename depthwise_conv_2d -> conv_2d_dw everywhere --- ggml/include/ggml.h | 22 ++++- ggml/src/ggml-cpu/ggml-cpu.c | 5 + ggml/src/ggml-cpu/ops.cpp | 172 +++++++++++++++++++++++++++++++++++ ggml/src/ggml-cpu/ops.h | 1 + ggml/src/ggml.c | 53 ++++++++++- 5 files changed, 250 insertions(+), 3 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8fcc16df998be..51aa5b3a0ab44 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -481,6 +481,7 @@ extern "C" { GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_IM2COL, GGML_OP_IM2COL_BACK, + GGML_OP_CONV_2D_DW, GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, @@ -677,6 +678,9 @@ extern "C" { GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 + // true for tensor that is stored in memory as CxWxHxN and has been permuted to WxHxCxN + GGML_API bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor); + GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); @@ -1660,7 +1664,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // depthwise + // depthwise (via im2col and mul_mat) GGML_API struct ggml_tensor * ggml_conv_2d_dw( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel @@ -1672,6 +1676,22 @@ extern "C" { int d0, // dilation dimension 0 int d1); // dilation dimension 1 + // Depthwise 2D convolution + // may be faster than ggml_conv_2d_dw, but not available in all backends + // a: KW KH 1 C convolution kernel + // b: W H C N input data + // res: W_out H_out C N + GGML_API struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1); + GGML_API struct ggml_tensor * ggml_conv_transpose_2d_p0( struct ggml_context * ctx, struct ggml_tensor * a, 
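For illustration (this snippet is not part of the patch): a minimal sketch of how the new ggml_conv_2d_dw_direct call might be used, assuming an already initialized ggml_context named ctx0; the tensor names and dimension values are arbitrary placeholders chosen for the example, and the shapes follow the KW KH 1 C kernel / W H C N input layout documented in the header above.

    // 3x3 depthwise convolution, stride 1, pad 1, no dilation: output keeps the spatial size
    const int W = 64, H = 64, C = 16, N = 1;
    struct ggml_tensor * kernel = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, 3, 3, 1, C); // KW KH 1  C
    struct ggml_tensor * input  = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, W, H, C, N); // W  H  C  N
    struct ggml_tensor * out    = ggml_conv_2d_dw_direct(ctx0, kernel, input,
                                                         /*stride*/   1, 1,
                                                         /*pad*/      1, 1,
                                                         /*dilation*/ 1, 1);           // W_out H_out C N
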
diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 50400328738ef..dbad8f61a1e92 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -1932,6 +1932,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_im2col_back_f32(params, tensor); } break; + case GGML_OP_CONV_2D_DW: + { + ggml_compute_forward_conv_2d_dw(params, tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor); @@ -2268,6 +2272,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { } break; case GGML_OP_IM2COL: case GGML_OP_IM2COL_BACK: + case GGML_OP_CONV_2D_DW: case GGML_OP_CONV_TRANSPOSE_1D: case GGML_OP_CONV_TRANSPOSE_2D: { diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 6050147be70ac..3c2adb217267b 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -6064,6 +6064,178 @@ void ggml_compute_forward_conv_transpose_2d( } } +// ggml_compute_forward_conv_2d_dw + +struct ggml_conv_2d_dw_params { + int64_t channels; + int64_t batch; + int64_t src_w; + int64_t src_h; + int64_t dst_w; + int64_t dst_h; + int64_t knl_w; + int64_t knl_h; + int stride_x; + int stride_y; + int pad_x; + int pad_y; + int dilation_x; + int dilation_y; +}; + +static void ggml_compute_forward_conv_2d_dw_cwhn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t c = p.channels; + const float * knl_data = (const float *)kernel->data; + + const int64_t rows_total = p.dst_h * p.batch; + const int64_t rows_per_thread = (rows_total + params->nth - 1) / params->nth; + const int64_t row_start = params->ith * rows_per_thread; + const int64_t row_end = MIN(row_start + rows_per_thread, rows_total); + +#ifdef GGML_SIMD + const int64_t pkg_size = GGML_F32_EPR; + const int64_t pkg_count = c / pkg_size; + const int64_t c_pkg_end = pkg_count * pkg_size; +#else + const int64_t c_pkg_end = 0; +#endif + + for (int64_t row = row_start; row < row_end; ++row) { + const int64_t dst_y = row % p.dst_h; + const float * src_data = (const float *)src->data + (row / p.dst_h) * p.src_w * p.src_h * c; + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + float * dst_data = (float *)dst->data + (row * p.dst_w + dst_x) * c; + const int64_t src_y_base = dst_y * p.stride_y - p.pad_y; + const int64_t src_x_base = dst_x * p.stride_x - p.pad_x; + +#ifdef GGML_SIMD + // Vectorized loop + for (int64_t c_i = 0; c_i < c_pkg_end; c_i += pkg_size) { + GGML_F32_VEC sum = GGML_F32_VEC_ZERO; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + GGML_F32_VEC k = GGML_F32_VEC_LOAD(knl_data + (knl_y * p.knl_w + knl_x) * c + c_i); + GGML_F32_VEC s = GGML_F32_VEC_LOAD(src_data + (src_y * p.src_w + src_x) * c + c_i); + sum = GGML_F32_VEC_FMA(sum, k, s); + } + } + GGML_F32_VEC_STORE(dst_data + c_i, sum); + } +#endif + // Scalar loop + for (int64_t c_i = c_pkg_end; c_i < c; ++c_i) { + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = src_y_base + knl_y * p.dilation_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 
0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = src_x_base + knl_x * p.dilation_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[(knl_y * p.knl_w + knl_x) * c + c_i] + * src_data[(src_y * p.src_w + src_x) * c + c_i]; + } + } + dst_data[c_i] = sum; + } + } + } +} + +static void ggml_compute_forward_conv_2d_dw_whcn( + const ggml_compute_params * params, + const ggml_tensor * src, + const ggml_tensor * kernel, + ggml_tensor * dst, + const ggml_conv_2d_dw_params & p) { + + const int64_t n = p.channels * p.batch; + const int64_t per_thread = (n + params->nth - 1) / params->nth; + const int64_t start = params->ith * per_thread; + const int64_t end = MIN(start + per_thread, n); + + for (int64_t i = start; i < end; ++i) { + const float * knl_data = (const float *)kernel->data + (i % p.channels) * p.knl_w * p.knl_h; + const float * src_data = (const float *)src->data + i * p.src_w * p.src_h; + float * dst_data = (float *)dst->data + i * p.dst_w * p.dst_h; + + for (int64_t dst_y = 0; dst_y < p.dst_h; ++dst_y) { + for (int64_t dst_x = 0; dst_x < p.dst_w; ++dst_x) { + + float sum = 0.0f; + for (int64_t knl_y = 0; knl_y < p.knl_h; ++knl_y) { + const int64_t src_y = dst_y * p.stride_y + knl_y * p.dilation_y - p.pad_y; + if (src_y < 0 || src_y >= p.src_h) { + continue; + } + for (int64_t knl_x = 0; knl_x < p.knl_w; ++knl_x) { + const int64_t src_x = dst_x * p.stride_x + knl_x * p.dilation_x - p.pad_x; + if (src_x < 0 || src_x >= p.src_w) { + continue; + } + sum += knl_data[knl_y * p.knl_w + knl_x] + * src_data[src_y * p.src_w + src_x]; + } + } + dst_data[dst_y * p.dst_w + dst_x] = sum; + } + } + } +} + +void ggml_compute_forward_conv_2d_dw( + const ggml_compute_params * params, + ggml_tensor * dst) { + + const ggml_tensor * kernel = dst->src[0]; + const ggml_tensor * src = dst->src[1]; + ggml_conv_2d_dw_params p; + p.channels = src->ne[2]; + p.batch = src->ne[3]; + p.src_w = src->ne[0]; + p.src_h = src->ne[1]; + p.dst_w = dst->ne[0]; + p.dst_h = dst->ne[1]; + p.knl_w = kernel->ne[0]; + p.knl_h = kernel->ne[1]; + p.stride_x = dst->op_params[0]; + p.stride_y = dst->op_params[1]; + p.pad_x = dst->op_params[2]; + p.pad_y = dst->op_params[3]; + p.dilation_x = dst->op_params[4]; + p.dilation_y = dst->op_params[5]; + + GGML_ASSERT(kernel->ne[3] == p.channels); + GGML_ASSERT(dst->ne[3] == p.batch); + + if (ggml_is_contiguous(src)) { + ggml_compute_forward_conv_2d_dw_whcn(params, src, kernel, dst, p); + } else if (ggml_is_contiguous_channels(src)) { + // kernel should also have channels most contiguous in memory + GGML_ASSERT(kernel->nb[0] >= kernel->nb[2] && kernel->nb[1] >= kernel->nb[0]); + ggml_compute_forward_conv_2d_dw_cwhn(params, src, kernel, dst, p); + } else { + GGML_ABORT("non-contiguous memory layout not supported"); + } +} + // ggml_compute_forward_pool_1d_sk_p0 static void ggml_compute_forward_pool_1d_sk_p0( diff --git a/ggml/src/ggml-cpu/ops.h b/ggml/src/ggml-cpu/ops.h index 410a372047a01..dc081b9e66397 100644 --- a/ggml/src/ggml-cpu/ops.h +++ b/ggml/src/ggml-cpu/ops.h @@ -65,6 +65,7 @@ void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * p void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); +void ggml_compute_forward_conv_2d_dw(const struct ggml_compute_params * 
params, struct ggml_tensor * dst); void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst); void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst); diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 950772c75cb32..c8b2feff4251d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -956,6 +956,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CONV_TRANSPOSE_1D", "IM2COL", "IM2COL_BACK", + "CONV_2D_DW", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", @@ -993,7 +994,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "OPT_STEP_ADAMW", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1050,6 +1051,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "conv_transpose_1d(x)", "im2col(x)", "im2col_back(x)", + "conv_2d_dw(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", @@ -1087,7 +1089,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "adamw(x)", }; -static_assert(GGML_OP_COUNT == 81, "GGML_OP_COUNT != 81"); +static_assert(GGML_OP_COUNT == 82, "GGML_OP_COUNT != 82"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -1344,6 +1346,13 @@ bool ggml_is_permuted(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1] || tensor->nb[1] > tensor->nb[2] || tensor->nb[2] > tensor->nb[3]; } +bool ggml_is_contiguous_channels(const struct ggml_tensor * tensor) { + return + tensor->nb[0] > tensor->nb[2] && + tensor->nb[1] > tensor->nb[0] && + tensor->nb[2] == ggml_type_size(tensor->type); +} + static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); @@ -4050,6 +4059,46 @@ struct ggml_tensor * ggml_conv_2d_dw( return result; } +// ggml_conv_2d_dw_direct + +struct ggml_tensor * ggml_conv_2d_dw_direct( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int stride0, + int stride1, + int pad0, + int pad1, + int dilation0, + int dilation1) { + GGML_ASSERT(a->ne[2] == 1); + GGML_ASSERT(a->ne[3] == b->ne[2]); + int64_t ne[4]; + ne[0] = ggml_calc_conv_output_size(b->ne[0], a->ne[0], stride0, pad0, dilation0); + ne[1] = ggml_calc_conv_output_size(b->ne[1], a->ne[1], stride1, pad1, dilation1); + ne[2] = b->ne[2]; + ne[3] = b->ne[3]; + + struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne); + + if (ggml_is_contiguous_channels(b)) { + // Result will be permuted the same way as input (CWHN order) + const int64_t type_size = ggml_type_size(result->type); + GGML_ASSERT(ggml_blck_size(result->type) == 1); + result->nb[0] = result->ne[2] * type_size; + result->nb[1] = result->ne[0] * result->nb[0]; + result->nb[2] = type_size; + } + + int32_t params[] = { stride0, stride1, pad0, pad1, dilation0, dilation1 }; + ggml_set_op_params(result, params, sizeof(params)); + + result->op = GGML_OP_CONV_2D_DW; + result->src[0] = a; + result->src[1] = b; + return result; +} + // ggml_conv_transpose_2d_p0 static int64_t ggml_calc_conv_transpose_output_size(int64_t ins, int64_t ks, int s, int p) { From 63b4911494afe04778c61b9c19019341d71c99fc Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 16:47:43 +0300 Subject: [PATCH 025/200] sync : ggml ggml-ci --- 
scripts/sync-ggml.last | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index cad082a902c44..41feffca9212d 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -f71d538ece3fb32a04824dc6d1e73e360be9d22f +13bcf9ce50651a8b4238ec6d136f46f2c1b23b6f From 87616f0680947800ecba3e9f6bc6e101943bf8e6 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 17:22:27 +0300 Subject: [PATCH 026/200] ggml : fix trailing whitespaces (#0) --- ggml/src/ggml.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index c8b2feff4251d..2a39dc7bfd125 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4069,7 +4069,7 @@ struct ggml_tensor * ggml_conv_2d_dw_direct( int stride1, int pad0, int pad1, - int dilation0, + int dilation0, int dilation1) { GGML_ASSERT(a->ne[2] == 1); GGML_ASSERT(a->ne[3] == b->ne[2]); From 226251ed56b85190e18a1cca963c45b888f4953c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Thu, 24 Apr 2025 22:29:22 +0300 Subject: [PATCH 027/200] embeddings : fix batch sizes (#13076) ggml-ci --- examples/embedding/embedding.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp index 6f08904159fd5..06fce236e2b85 100644 --- a/examples/embedding/embedding.cpp +++ b/examples/embedding/embedding.cpp @@ -89,6 +89,13 @@ int main(int argc, char ** argv) { common_init(); params.embedding = true; + + // utilize the full context + if (params.n_batch < params.n_ctx) { + LOG_WRN("%s: setting batch size to %d\n", __func__, params.n_ctx); + params.n_batch = params.n_ctx; + } + // For non-causal models, batch size must be equal to ubatch size params.n_ubatch = params.n_batch; @@ -134,7 +141,6 @@ int main(int argc, char ** argv) { // max batch size const uint64_t n_batch = params.n_batch; - GGML_ASSERT(params.n_batch >= params.n_ctx); // tokenize the prompts and trim std::vector> inputs; From 13be08daf992c89d5169518229b3740041c0f419 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Thu, 24 Apr 2025 22:17:04 +0200 Subject: [PATCH 028/200] clip : remove boi/eoi embeddings for GLM-edge model (#13081) --- examples/llava/clip-impl.h | 2 -- examples/llava/clip.cpp | 17 +---------------- examples/llava/mtmd.cpp | 5 +++++ 3 files changed, 6 insertions(+), 18 deletions(-) diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 8d310fb0271c5..53ac381304765 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -90,8 +90,6 @@ #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s" #define TN_GLM_ADAPTER_GATE "adapter.linear.gate.%s" #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s" -#define TN_GLM_BOI_W "adapter.boi" -#define TN_GLM_EOI_W "adapter.eoi" enum projector_type { PROJECTOR_TYPE_MLP, diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 4eec4a2646798..9a5ab7c819585 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -244,8 +244,6 @@ struct clip_vision_model { //GLMV-Edge projection struct ggml_tensor * mm_model_adapter_conv_w = nullptr; struct ggml_tensor * mm_model_adapter_conv_b = nullptr; - struct ggml_tensor * boi_w = nullptr; - struct ggml_tensor * eoi_w = nullptr; // MobileVLM projection struct ggml_tensor * mm_model_mlp_1_w = nullptr; @@ -1697,8 +1695,6 @@ struct clip_model_loader { vision_model.mm_model_mlp_1_w = 
get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight")); vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight")); vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); - vision_model.boi_w = get_tensor(TN_GLM_BOI_W); - vision_model.eoi_w = get_tensor(TN_GLM_EOI_W); } break; case PROJECTOR_TYPE_MERGER: { @@ -2593,8 +2589,7 @@ void clip_free(clip_ctx * ctx) { } size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - int extra_tokens = ctx->has_glm_projector ? 2 : 0; - return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float); + return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); } size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { @@ -2790,9 +2785,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } if (ctx->has_glm_projector) { GGML_ASSERT(batch_size == 1); - ggml_tensor * boi = ctx->vision_model.boi_w; - ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi)); - vec = (float*)(vec+ggml_nelements(boi)); //offset for boi } // build the inference graph @@ -3001,13 +2993,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // copy the embeddings to the location passed by the user ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings)); - if (ctx->has_glm_projector) { - //eoi - ggml_tensor * eoi = ctx->vision_model.eoi_w; - int offset = ggml_nelements(embeddings); - ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi)); - } - return true; } diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index 11ca7b30f1ac6..a994ef0166e6a 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx, marker_modified = "" + ctx->image_marker + ""; string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) { + // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|> + marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>"; + string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) { // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215 marker_modified = "" + ctx->image_marker + ""; From 553a5c3a9fdf771be2101bc3529937963f817457 Mon Sep 17 00:00:00 2001 From: Radoslav Gerganov Date: Fri, 25 Apr 2025 10:08:08 +0300 Subject: [PATCH 029/200] rpc : do not wait for response when sending RPC_CMD_SET_TENSOR (#12943) RPC_CMD_SET_TENSOR always returns an empty response and we send this 4 times per token. We can improve TG speed if we don't wait for this empty response. The performance impact of this change depends on the network latency. 
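For illustration (not part of the patch), the change on the client hot path amounts to dropping the wait for the known-empty reply by using the new fire-and-forget overload of send_rpc_cmd introduced below:

    // before: blocked until the server sent back an empty response
    send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0);
    // after: send the request and return immediately, no round trip per tensor upload
    send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size());
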
--- ggml/include/ggml-rpc.h | 2 +- ggml/src/ggml-rpc/ggml-rpc.cpp | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/ggml/include/ggml-rpc.h b/ggml/include/ggml-rpc.h index c8b6097f7e573..1e674112767c9 100644 --- a/ggml/include/ggml-rpc.h +++ b/ggml/include/ggml-rpc.h @@ -7,7 +7,7 @@ extern "C" { #endif -#define RPC_PROTO_MAJOR_VERSION 1 +#define RPC_PROTO_MAJOR_VERSION 2 #define RPC_PROTO_MINOR_VERSION 0 #define RPC_PROTO_PATCH_VERSION 0 #define GGML_RPC_MAX_SERVERS 16 diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index a0667b7d702b2..9023eb0919690 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -378,8 +378,8 @@ static bool parse_endpoint(const std::string & endpoint, std::string & host, int } // RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) | -// RPC response: | response_size (8 bytes) | response_data (response_size bytes) | -static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) { +// No response +static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const void * input, size_t input_size) { uint8_t cmd_byte = cmd; if (!send_data(sock->fd, &cmd_byte, sizeof(cmd_byte))) { return false; @@ -390,6 +390,15 @@ static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cm if (!send_data(sock->fd, input, input_size)) { return false; } + return true; +} + +// RPC request : | rpc_cmd (1 byte) | request_size (8 bytes) | request_data (request_size bytes) | +// RPC response: | response_size (8 bytes) | response_data (response_size bytes) | +static bool send_rpc_cmd(const std::shared_ptr & sock, enum rpc_cmd cmd, const void * input, size_t input_size, void * output, size_t output_size) { + if (!send_rpc_cmd(sock, cmd, input, input_size)) { + return false; + } // TODO: currently the output_size is always known, do we need support for commands with variable output size? 
// even if we do, we can skip sending output_size from the server for commands with known output size uint64_t out_size; @@ -555,7 +564,7 @@ static void ggml_backend_rpc_buffer_set_tensor(ggml_backend_buffer_t buffer, ggm memcpy(input.data(), &rpc_tensor, sizeof(rpc_tensor)); memcpy(input.data() + sizeof(rpc_tensor), &offset, sizeof(offset)); memcpy(input.data() + sizeof(rpc_tensor) + sizeof(offset), data, size); - bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size(), nullptr, 0); + bool status = send_rpc_cmd(ctx->sock, RPC_CMD_SET_TENSOR, input.data(), input.size()); GGML_ASSERT(status); } @@ -1428,9 +1437,6 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir, if (!server.set_tensor(input)) { return; } - if (!send_msg(sockfd, nullptr, 0)) { - return; - } break; } case RPC_CMD_SET_TENSOR_HASH: { From 514c45608f93f66106a712dee1abe062099ce790 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Fri, 25 Apr 2025 17:37:51 +0800 Subject: [PATCH 030/200] change the reorder tensor from init to execute OP (#13003) --- ggml/src/ggml-sycl/common.hpp | 1 - ggml/src/ggml-sycl/ggml-sycl.cpp | 125 +++++++++++++++---------------- 2 files changed, 61 insertions(+), 65 deletions(-) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 96becabc85ae5..0ab0fb0aa394d 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -313,7 +313,6 @@ struct ggml_backend_sycl_context { int device; std::string name; optimize_feature opt_feature; - bool optimized_graph=false; queue_ptr qptrs[GGML_SYCL_MAX_DEVICES][GGML_SYCL_MAX_STREAMS] = { { nullptr } }; diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 8081a77b74f67..548f2d0a06be0 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -192,7 +192,7 @@ static void ggml_check_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); - g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 1); + g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0); g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1); GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n"); GGML_LOG_INFO("Running with Environment Variables:\n"); @@ -2852,6 +2852,64 @@ static bool ggml_sycl_supports_dmmv(enum ggml_type type) { } } +static void reorder_qw(char *data_device, const int ncols, const int nrows, + size_t size, size_t offset, dpct::queue_ptr stream) { + auto tmp_buf = sycl::malloc_shared(size, *stream); + SYCL_CHECK( + CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size) + .wait())); + GGML_ASSERT((size % sizeof(block_q4_0) == 0)); + GGML_ASSERT((offset % sizeof(block_q4_0) == 0)); + int offset_blks = offset / sizeof(block_q4_0); + auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;; + auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks; + + stream->parallel_for( + size / sizeof(block_q4_0), + [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { + const block_q4_0* x = (const block_q4_0*)tmp_buf; + const int ib = i; + + for (int j = 0; j < QK4_0/2; j ++) + { + *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j]; + } + *(d_ptr + ib) = x[ib].d; + }); + + sycl::free(tmp_buf, *stream); +} + +static void reorder_qw(const ggml_tensor * src0, dpct::queue_ptr stream) { + char*data_device = (char*)src0->data; + size_t ncols = src0->ne[0]; + size_t nrows = src0->ne[1]; + size_t size = ggml_nbytes(src0); + + 
reorder_qw(data_device, ncols, nrows, size, 0, stream); +} + +/* +* This function could be called when the OP (mul_mat) function support reorder optimizition. +*/ +static void opt_for_reorder(ggml_backend_sycl_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, + ggml_tensor * dst) { + if (!g_ggml_sycl_disable_optimize && //allow optimize, controlled by $GGML_SYCL_DISABLE_OPT + ctx->opt_feature.reorder && //allow this device due to good perf, skip the devices with bad perf. + dst->op == GGML_OP_MUL_MAT && //limit to some supported cases of Q4_0, to do for more cases. + src0->type == GGML_TYPE_Q4_0 && + src1->ne[2]==1 && src1->ne[3]==1) { + + ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra; + if (!extra) return; //only happen in CI/UT permute case. + + if (extra->optimized_feature.reorder) return; //skip the tensor which is handled for reorder. + + reorder_qw(src0, ctx->stream()); + extra->optimized_feature.reorder = true; //used to decode/dequan in next steps. + } +} + static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool split = ggml_backend_buffer_is_sycl_split(src0->buffer); @@ -2914,6 +2972,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor // KQ + KQV multi-batch ggml_sycl_mul_mat_batched_sycl(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { + opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder. ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_dequantize_mul_mat_vec, false); // save_tensor_txt("1/dst_1.txt", (float*) dst->data, src0->ne[1], sizeof(float), ctx.stream()); } else if (use_mul_mat_vec_q) { @@ -2921,6 +2980,7 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor } else if (use_mul_mat_q) { ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_q, true); } else { + opt_for_reorder(&ctx, src0, src1, dst); //the OP function in this branch support reorder. 
ggml_sycl_op_mul_mat(ctx, src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); } } @@ -3545,71 +3605,8 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void reorder_qw(char *data_device, const int ncols, const int nrows, - size_t size, size_t offset, dpct::queue_ptr stream) { - auto tmp_buf = sycl::malloc_shared(size, *stream); - SYCL_CHECK( - CHECK_TRY_ERROR((*stream).memcpy(tmp_buf, data_device, size) - .wait())); - GGML_ASSERT((size % sizeof(block_q4_0) == 0)); - GGML_ASSERT((offset % sizeof(block_q4_0) == 0)); - int offset_blks = offset / sizeof(block_q4_0); - auto qs_ptr = (uint8_t*)data_device + offset_blks * QK4_0 / 2;; - auto d_ptr = (sycl::half*)(qs_ptr + ncols * nrows / 2) + offset_blks; - - stream->parallel_for( - size / sizeof(block_q4_0), - [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] { - const block_q4_0* x = (const block_q4_0*)tmp_buf; - const int ib = i; - - for (int j = 0; j < QK4_0/2; j ++) - { - *(qs_ptr + ib * QK4_0 / 2 + j) = x[ib].qs[j]; - } - *(d_ptr + ib) = x[ib].d; - }); - - sycl::free(tmp_buf, *stream); -} - -static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) { - char*data_device = (char*)src0->data; - size_t ncols = src0->ne[0]; - size_t nrows = src0->ne[1]; - size_t size = ggml_nbytes(src0); - - reorder_qw(data_device, ncols, nrows, size, 0, stream); -} - -static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) { - ggml_tensor *src0 = dst->src[0]; - ggml_tensor *src1 = dst->src[1]; - - if (dst->op == GGML_OP_MUL_MAT && src0->type == GGML_TYPE_Q4_0 && - src1->ne[2]==1 && src1->ne[3]==1) { - reorder_qw(src0, stream); - ggml_tensor_extra_gpu* extra = (ggml_tensor_extra_gpu*)src0->extra; - GGML_ASSERT(extra); - extra->optimized_feature.reorder = true; //used to decode/dequan in next steps. 
- } -} - -static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) { - dpct::queue_ptr stream = ctx->stream(); - if (ctx->optimized_graph) { - return; - } - ctx->optimized_graph = true; - - for (int i = 0; i < cgraph->n_nodes; i++) { - if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream); - } -} - static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) { ggml_sycl_set_main_device(sycl_ctx->device); - if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; From edb18b6e8f5ea6509ad43057f8bb98fc557dbc4e Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Fri, 25 Apr 2025 14:31:42 +0200 Subject: [PATCH 031/200] clip : fix pixtral on some GPU backends (#13097) * clip : fix pixtral on some GPU backends * refactor inp_raw set * rm outdated comment * fix dynamic size * add TODO --- examples/llava/clip.cpp | 68 ++++++++++++++++++++++++++------------ tests/test-backend-ops.cpp | 2 ++ 2 files changed, 48 insertions(+), 22 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 9a5ab7c819585..da8a590f0e563 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -554,15 +554,15 @@ static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_im } // implementation of the 2D RoPE without adding a new op in ggml +// this is not efficient (use double the memory), but works on all backends +// TODO: there was a more efficient which relies on ggml_view and ggml_rope_ext_inplace, but the rope inplace does not work well with non-contiguous tensors ; we should fix that and revert back to the original implementation in https://github.com/ggml-org/llama.cpp/pull/13065 static ggml_tensor * build_rope_2d( - ggml_cgraph * gf, ggml_context * ctx0, ggml_tensor * cur, ggml_tensor * pos_h, ggml_tensor * pos_w, const float freq_base ) { - ggml_tensor * tmp; const int64_t n_dim = cur->ne[0]; const int64_t n_head = cur->ne[1]; const int64_t n_pos = cur->ne[2]; @@ -571,18 +571,23 @@ static ggml_tensor * build_rope_2d( // we will have a list of 4 inv_freq: 1e-0, 1e-1, 1e-2, 1e-3 // first half of cur will use 1e-0, 1e-2 (even) // second half of cur will use 1e-1, 1e-3 (odd) - // - // for the first half, the trick here is to rotate n_dim/2, so inv_freq will be even + // the trick here is to rotate just half of n_dim, so inv_freq will automatically be even // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2) // then for the second half, we use freq_scale to shift the inv_freq // ^ why? 
replace (2i) with (2i+1) in the above equation const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim); // first half + ggml_tensor * first; { - cur = ggml_rope_ext_inplace( + first = ggml_view_3d(ctx0, cur, + n_dim/2, n_head, n_pos, + ggml_row_size(cur->type, n_dim), + ggml_row_size(cur->type, n_dim*n_head), + 0); + first = ggml_rope_ext( ctx0, - cur, + first, pos_h, // positions nullptr, // freq factors n_dim/2, // n_dims @@ -592,15 +597,17 @@ static ggml_tensor * build_rope_2d( } // second half + ggml_tensor * second; { - tmp = ggml_view_3d(ctx0, cur, + second = ggml_view_3d(ctx0, cur, n_dim/2, n_head, n_pos, ggml_row_size(cur->type, n_dim), ggml_row_size(cur->type, n_dim*n_head), n_dim/2 * ggml_element_size(cur)); - tmp = ggml_rope_ext_inplace( + second = ggml_cont(ctx0, second); // copy, because ggml_rope don't play well with non-contiguous tensors + second = ggml_rope_ext( ctx0, - tmp, + second, pos_w, // positions nullptr, // freq factors n_dim/2, // n_dims @@ -608,10 +615,9 @@ static ggml_tensor * build_rope_2d( freq_scale_odd, 0.0f, 1.0f, 0.0f, 0.0f ); - // calculate inplace (modify cur directly) - ggml_build_forward_expand(gf, tmp); } + cur = ggml_concat(ctx0, first, second, 0); return cur; } @@ -680,13 +686,13 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i struct ggml_tensor * Q = ggml_mul_mat(ctx0, model.layers[il].q_w, cur); Q = ggml_reshape_3d(ctx0, Q, d_head, n_head, num_patches); - Q = build_rope_2d(gf, ctx0, Q, pos_h, pos_w, hparams.rope_theta); + Q = build_rope_2d(ctx0, Q, pos_h, pos_w, hparams.rope_theta); Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); struct ggml_tensor * K = ggml_mul_mat(ctx0, model.layers[il].k_w, cur); K = ggml_reshape_3d(ctx0, K, d_head, n_head, num_patches); - K = build_rope_2d(gf, ctx0, K, pos_h, pos_w, hparams.rope_theta); + K = build_rope_2d(ctx0, K, pos_h, pos_w, hparams.rope_theta); K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); struct ggml_tensor * V = ggml_mul_mat(ctx0, model.layers[il].v_w, cur); @@ -2796,10 +2802,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const auto & model = ctx->vision_model; const auto & hparams = model.hparams; + // TODO @ngxson : this is ugly, need to refactor later + bool support_dynamic_size = ctx->has_minicpmv_projector + || ctx->has_qwen2vl_merger + || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL; + const int image_size = hparams.image_size; int image_size_width = image_size; int image_size_height = image_size; - if (ctx->has_minicpmv_projector | ctx->has_qwen2vl_merger) { + if (support_dynamic_size) { image_size_width = imgs.entries[0]->nx; image_size_height = imgs.entries[0]->ny; } @@ -2811,9 +2822,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima { struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - float * data = (float *)malloc(ggml_nbytes(inp_raw)); + std::vector inp_data(ggml_nelements(inp_raw)); + float * data = inp_data.data(); + + // layout of data (note: the channel dim is unrolled to better visualize the layout): + // + // ┌──W──┐ + // │ H │ channel = R + // ├─────┤ │ + // │ H │ channel = G + // ├─────┤ │ + // │ H │ channel = B + // └─────┘ │ + // ──────┘ x B - // TODO @ngxson : this whole code block is ugly, will need to be refactored for (size_t i = 0; i < imgs.entries.size(); i++) { const int nx = imgs.entries[i]->nx; const int ny = imgs.entries[i]->ny; @@ -2828,17 +2850,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, 
const clip_ima const int n = nx * ny; for (int b = 0; b < batch_size; b++) { - for (int k = 0; k < 3; k++) { - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries[b]->buf[3 * (y * nx + x) + k]; - } + float * batch_entry = data + b * (3*n); + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + size_t base_src = 3*(y * nx + x); // idx of the first channel + size_t base_dst = y * nx + x; // idx of the first channel + batch_entry[ base_dst] = imgs.entries[b]->buf[base_src ]; + batch_entry[1*n + base_dst] = imgs.entries[b]->buf[base_src + 1]; + batch_entry[2*n + base_dst] = imgs.entries[b]->buf[base_src + 2]; } } } } ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); - free(data); } if (ctx->has_minicpmv_projector) { { diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 61751755b317b..d70acb7719435 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -2606,6 +2606,8 @@ struct test_rope : public test_case { } else { out = ggml_rope_ext_back(ctx, a, pos, freq, n_dims, mode, 0, 10000.0f, fs, ef, af, 1.0f, 1.0f); } + + // TODO: add test with a non-contiguous view as input ; this case is needed for build_rope_2d in clip.cpp } ggml_set_name(out, "out"); From 558a764713468f26f5a163d25a22100c9a04a48f Mon Sep 17 00:00:00 2001 From: City <125218114+city96@users.noreply.github.com> Date: Fri, 25 Apr 2025 14:38:34 +0200 Subject: [PATCH 032/200] Force FP32 compute in GLM4 FFN Down (#13101) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Force FP32 compute in cuBLAS GEMM * Revert "Force FP32 compute in cuBLAS GEMM" This reverts commit 6efd872732159ab88ee7b3c1d77ba5ebc83079bd. * Force F32 compute in GLM4 ffn down * Edit comment to clarify issue Co-authored-by: Johannes Gäßler --------- Co-authored-by: Johannes Gäßler --- src/llama-graph.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index a85e97288e1ae..b52e3f6203a4b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -803,6 +803,10 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); + if (arch == LLM_ARCH_GLM4) { + // GLM4 seems to have numerical issues with half-precision accumulators + ggml_mul_mat_set_prec(cur, GGML_PREC_F32); + } } if (down_b) { From 295354ea6848a77bdee204ee1c971d9b92ffcca9 Mon Sep 17 00:00:00 2001 From: Diego Devesa Date: Fri, 25 Apr 2025 19:40:11 +0200 Subject: [PATCH 033/200] llama : fix K-shift with quantized K and BLAS backend (#13113) --- src/llama-context.cpp | 17 +++-------------- src/llama-context.h | 3 +-- 2 files changed, 4 insertions(+), 16 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 983385f86d494..a52b6850b465d 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift( ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const { + float freq_scale) const { const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; const auto & yarn_ext_factor = cparams.yarn_ext_factor; @@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift( // dequantize to f32 -> RoPE -> quantize back tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); - if (bbuf) { - for (const auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), 
ggml_backend_buffer_get_type(bbuf))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - } - - tmp = ggml_rope_ext_inplace(ctx0, tmp, + tmp = ggml_rope_ext(ctx0, tmp, shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); @@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift( ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa), 0); - ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer); + ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l); ggml_build_forward_expand(gf, cur); } diff --git a/src/llama-context.h b/src/llama-context.h index 04facb544cb1a..5457f077c15bf 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -170,8 +170,7 @@ struct llama_context { ggml_tensor * shift, ggml_tensor * factors, float freq_base, - float freq_scale, - ggml_backend_buffer * bbuf) const; + float freq_scale) const; llm_graph_result_ptr build_kv_self_shift( ggml_context * ctx0, From d5fe4e81bd447124836ecfb47d794f8768665b9f Mon Sep 17 00:00:00 2001 From: frob Date: Sat, 26 Apr 2025 10:10:20 +0200 Subject: [PATCH 034/200] grammar : handle maxItems == 0 in JSON schema (#13117) Co-authored-by: Richard Lyons --- common/json-schema-to-grammar.cpp | 3 +++ examples/json_schema_to_grammar.py | 3 +++ .../public_legacy/json-schema-to-grammar.mjs | 3 +++ tests/test-json-schema-to-grammar.cpp | 16 ++++++++++++++++ 4 files changed, 25 insertions(+) diff --git a/common/json-schema-to-grammar.cpp b/common/json-schema-to-grammar.cpp index 9067982257120..5b3059c2f774f 100644 --- a/common/json-schema-to-grammar.cpp +++ b/common/json-schema-to-grammar.cpp @@ -16,6 +16,9 @@ using json = nlohmann::ordered_json; static std::string build_repetition(const std::string & item_rule, int min_items, int max_items, const std::string & separator_rule = "") { auto has_max = max_items != std::numeric_limits::max(); + if (max_items == 0) { + return ""; + } if (min_items == 0 && max_items == 1) { return item_rule + "?"; } diff --git a/examples/json_schema_to_grammar.py b/examples/json_schema_to_grammar.py index 55f94c0b0a864..ed379585546c2 100755 --- a/examples/json_schema_to_grammar.py +++ b/examples/json_schema_to_grammar.py @@ -10,6 +10,9 @@ def _build_repetition(item_rule, min_items, max_items, separator_rule=None): + if max_items == 0: + return "" + if min_items == 0 and max_items == 1: return f'{item_rule}?' 
diff --git a/examples/server/public_legacy/json-schema-to-grammar.mjs b/examples/server/public_legacy/json-schema-to-grammar.mjs index f767ce7b72008..b12bf2ab0909a 100644 --- a/examples/server/public_legacy/json-schema-to-grammar.mjs +++ b/examples/server/public_legacy/json-schema-to-grammar.mjs @@ -2,6 +2,9 @@ const SPACE_RULE = '| " " | "\\n"{1,2} [ \\t]{0,20}'; function _buildRepetition(itemRule, minItems, maxItems, opts={}) { + if (maxItems == 0) { + return ''; + } if (minItems === 0 && maxItems === 1) { return `${itemRule}?`; } diff --git a/tests/test-json-schema-to-grammar.cpp b/tests/test-json-schema-to-grammar.cpp index e35134f3cb063..38cf01d6d8dfb 100755 --- a/tests/test-json-schema-to-grammar.cpp +++ b/tests/test-json-schema-to-grammar.cpp @@ -597,6 +597,22 @@ static void test_all(const std::string & lang, std::function Date: Sat, 26 Apr 2025 22:05:31 +0800 Subject: [PATCH 035/200] ggml: move fp16/bf16 conversion optimizations to CPU backend + export conversion APIs (#13107) * ggml: dynamic x86_64 feature detection for FP32 <-> FP16/BF16 conversion * move fp converter to ggml-cpu * Switch ggml_compute_forward_get_rows_f16/bf16 to new ggml_cpu_fp16/bf16_to_fp32 --- ggml/include/ggml-cpu.h | 5 ++ ggml/src/ggml-cpu/ggml-cpu.c | 91 +++++++++++++++++++++++++++++++++++- ggml/src/ggml-cpu/ops.cpp | 4 +- ggml/src/ggml.c | 51 ++------------------ 4 files changed, 101 insertions(+), 50 deletions(-) diff --git a/ggml/include/ggml-cpu.h b/ggml/include/ggml-cpu.h index f5e11f1e10002..de77a875ec533 100644 --- a/ggml/include/ggml-cpu.h +++ b/ggml/include/ggml-cpu.h @@ -133,6 +133,11 @@ extern "C" { GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void); + GGML_BACKEND_API void ggml_cpu_fp32_to_fp16(const float *, ggml_fp16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp16_to_fp32(const ggml_fp16_t *, float *, int64_t); + GGML_BACKEND_API void ggml_cpu_fp32_to_bf16(const float *, ggml_bf16_t *, int64_t); + GGML_BACKEND_API void ggml_cpu_bf16_to_fp32(const ggml_bf16_t *, float *, int64_t); + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index dbad8f61a1e92..64405449e2467 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -215,7 +215,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .nrows = 1, }, [GGML_TYPE_F16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_fp16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16, .vec_dot_type = GGML_TYPE_F16, .nrows = 1, @@ -356,7 +356,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { .from_float = quantize_row_q8_K, }, [GGML_TYPE_BF16] = { - .from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row, + .from_float = (ggml_from_float_t) ggml_cpu_fp32_to_bf16, .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16, .vec_dot_type = GGML_TYPE_BF16, .nrows = 1, @@ -3166,6 +3166,93 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g return ggml_graph_compute(cgraph, &cplan); } +void ggml_cpu_fp32_to_fp16(const float * x, ggml_fp16_t * y, int64_t n) { + int64_t i = 0; +#if defined(__F16C__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + __m512 x_vec = _mm512_loadu_ps(x + i); + __m256i y_vec = _mm512_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm256_storeu_si256((__m256i *)(y + i), y_vec); + } +#endif + for (; i + 7 < n; i += 8) { + __m256 x_vec = _mm256_loadu_ps(x + i); + __m128i y_vec = _mm256_cvtps_ph(x_vec, 
_MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128((__m128i *)(y + i), y_vec); + } + for (; i + 3 < n; i += 4) { + __m128 x_vec = _mm_loadu_ps(x + i); + __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64((__m128i *)(y + i), y_vec); + } +#endif + for (; i < n; ++i) { + y[i] = GGML_FP32_TO_FP16(x[i]); + } +} + +void ggml_cpu_fp16_to_fp32(const ggml_fp16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__F16C__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + __m256i x_vec = _mm256_loadu_si256((const __m256i *)(x + i)); + __m512 y_vec = _mm512_cvtph_ps(x_vec); + _mm512_storeu_ps(y + i, y_vec); + } +#endif + for (; i + 7 < n; i += 8) { + __m128i x_vec = _mm_loadu_si128((const __m128i *)(x + i)); + __m256 y_vec = _mm256_cvtph_ps(x_vec); + _mm256_storeu_ps(y + i, y_vec); + } + for (; i + 3 < n; i += 4) { + __m128i x_vec = _mm_loadl_epi64((const __m128i *)(x + i)); + __m128 y_vec = _mm_cvtph_ps(x_vec); + _mm_storeu_ps(y + i, y_vec); + } +#endif + for (; i < n; ++i) { + y[i] = GGML_FP16_TO_FP32(x[i]); + } +} + +void ggml_cpu_fp32_to_bf16(const float * x, ggml_bf16_t * y, int64_t n) { + int64_t i = 0; + for (; i < n; ++i) { + y[i] = GGML_FP32_TO_BF16(x[i]); + } +} + +void ggml_cpu_bf16_to_fp32(const ggml_bf16_t * x, float * y, int64_t n) { + int64_t i = 0; +#if defined(__AVX2__) +#if defined(__AVX512F__) + for (; i + 15 < n; i += 16) { + _mm512_storeu_ps(y + i, + _mm512_castsi512_ps( + _mm512_slli_epi32( + _mm512_cvtepu16_epi32( + _mm256_loadu_si256( + (const __m256i *)(x + i))), + 16))); + } +#endif + for (; i + 7 < n; i += 8) { + _mm256_storeu_ps(y + i, + _mm256_castsi256_ps( + _mm256_slli_epi32( + _mm256_cvtepu16_epi32( + _mm_loadu_si128( + (const __m128i *)(x + i))), + 16))); + } +#endif + for (; i < n; i++) { + y[i] = GGML_BF16_TO_FP32(x[i]); + } +} int ggml_cpu_has_avx(void) { #if defined(__AVX__) diff --git a/ggml/src/ggml-cpu/ops.cpp b/ggml/src/ggml-cpu/ops.cpp index 3c2adb217267b..7413192b746b6 100644 --- a/ggml/src/ggml-cpu/ops.cpp +++ b/ggml/src/ggml-cpu/ops.cpp @@ -4222,7 +4222,7 @@ static void ggml_compute_forward_get_rows_f16( GGML_ASSERT(i01 >= 0 && i01 < ne01); - ggml_fp16_to_fp32_row( + ggml_cpu_fp16_to_fp32( (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } @@ -4263,7 +4263,7 @@ static void ggml_compute_forward_get_rows_bf16( GGML_ASSERT(i01 >= 0 && i01 < ne01); - ggml_bf16_to_fp32_row( + ggml_cpu_bf16_to_fp32( (const ggml_bf16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03), (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc); } diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 2a39dc7bfd125..7654ae1779b1d 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4,6 +4,7 @@ #include "ggml-backend.h" #include "ggml-impl.h" #include "ggml-threading.h" +#include "ggml-cpu.h" #include "ggml.h" // FIXME: required here for quantization functions @@ -382,58 +383,16 @@ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n) { } } -// FIXME: these functions must detect the instruction set at runtime, since they are part of the core ggml library -// currently, the ggml_cpu_has_* functions are entirely compile-time void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n) { - int64_t i = 0; -#if defined(__F16C__) - //if (ggml_cpu_has_f16c()) { - for (; i + 7 < n; i += 8) { - __m256 x_vec = _mm256_loadu_ps(x + i); - __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - 
_mm_storeu_si128((__m128i *)(y + i), y_vec); - } - for(; i + 3 < n; i += 4) { - __m128 x_vec = _mm_loadu_ps(x + i); - __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT); - _mm_storel_epi64((__m128i *)(y + i), y_vec); - } - //} -#endif - for (; i < n; i++) { + int i = 0; + for (; i < n; ++i) { y[i] = GGML_FP32_TO_FP16(x[i]); } } void ggml_bf16_to_fp32_row(const ggml_bf16_t * x, float * y, int64_t n) { - int64_t i = 0; -#if defined(__AVX512F__) - //if (ggml_cpu_has_avx512()) { - for (; i + 16 <= n; i += 16) { - _mm512_storeu_ps(y + i, - _mm512_castsi512_ps( - _mm512_slli_epi32( - _mm512_cvtepu16_epi32( - _mm256_loadu_si256( - (const __m256i *)(x + i))), - 16))); - } - //} -#endif -#if defined(__AVX2__) - //if (ggml_cpu_has_avx2()) { - for (; i + 8 <= n; i += 8) { - _mm256_storeu_ps(y + i, - _mm256_castsi256_ps( - _mm256_slli_epi32( - _mm256_cvtepu16_epi32( - _mm_loadu_si128( - (const __m128i *)(x + i))), - 16))); - } - //} -#endif - for (; i < n; i++) { + int i = 0; + for (; i < n; ++i) { y[i] = GGML_BF16_TO_FP32(x[i]); } } From 4753791e70acd4d4e02f2098f14a03df26c992bd Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 26 Apr 2025 22:39:47 +0200 Subject: [PATCH 036/200] clip : improve projector naming (#13118) * clip : improve projector naming * no more kv has_llava_projector * rm unused kv * rm more unused --- examples/llava/clip-impl.h | 27 +- examples/llava/clip.cpp | 512 ++++++++++++++++--------------------- examples/llava/clip.h | 2 - 3 files changed, 235 insertions(+), 306 deletions(-) diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 53ac381304765..16d0a8efc56ae 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -17,22 +17,15 @@ #define KEY_FTYPE "general.file_type" #define KEY_NAME "general.name" #define KEY_DESCRIPTION "general.description" -#define KEY_HAS_TEXT_ENC "clip.has_text_encoder" -#define KEY_HAS_VIS_ENC "clip.has_vision_encoder" -#define KEY_HAS_LLAVA_PROJ "clip.has_llava_projector" -#define KEY_HAS_MINICPMV_PROJ "clip.has_minicpmv_projector" -#define KEY_HAS_GLM_PROJ "clip.has_glm_projector" #define KEY_MINICPMV_VERSION "clip.minicpmv_version" -#define KEY_HAS_QWEN2VL_MERGER "clip.has_qwen2vl_merger" #define KEY_USE_GELU "clip.use_gelu" #define KEY_USE_SILU "clip.use_silu" -#define KEY_N_EMBD "clip.%s.embedding_length" -#define KEY_N_FF "clip.%s.feed_forward_length" -#define KEY_N_BLOCK "clip.%s.block_count" -#define KEY_N_HEAD "clip.%s.attention.head_count" -#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" -#define KEY_PROJ_DIM "clip.%s.projection_dim" -#define KEY_TOKENS "tokenizer.ggml.tokens" +#define KEY_N_EMBD "clip.vision.embedding_length" +#define KEY_N_FF "clip.vision.feed_forward_length" +#define KEY_N_BLOCK "clip.vision.block_count" +#define KEY_N_HEAD "clip.vision.attention.head_count" +#define KEY_LAYER_NORM_EPS "clip.vision.attention.layer_norm_epsilon" +#define KEY_PROJ_DIM "clip.vision.projection_dim" #define KEY_IMAGE_SIZE "clip.vision.image_size" #define KEY_PATCH_SIZE "clip.vision.patch_size" #define KEY_IMAGE_MEAN "clip.vision.image_mean" @@ -96,9 +89,9 @@ enum projector_type { PROJECTOR_TYPE_MLP_NORM, PROJECTOR_TYPE_LDP, PROJECTOR_TYPE_LDPV2, - PROJECTOR_TYPE_RESAMPLER, + PROJECTOR_TYPE_MINICPMV, PROJECTOR_TYPE_GLM_EDGE, - PROJECTOR_TYPE_MERGER, + PROJECTOR_TYPE_QWEN2VL, PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, @@ -109,9 +102,9 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MLP, "mlp" }, { PROJECTOR_TYPE_LDP, "ldp" }, { 
PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_RESAMPLER, "resampler"}, + { PROJECTOR_TYPE_MINICPMV, "resampler"}, { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_MERGER, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index da8a590f0e563..e8c01c68a9779 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -308,13 +308,8 @@ struct clip_vision_model { }; struct clip_ctx { - bool has_text_encoder = false; - bool has_vision_encoder = false; bool has_llava_projector = false; - bool has_minicpmv_projector = false; - bool has_glm_projector = false; - bool has_qwen2vl_merger = false; - int minicpmv_version = 2; + int minicpmv_version = 0; struct clip_vision_model vision_model; projector_type proj_type = PROJECTOR_TYPE_MLP; @@ -373,23 +368,20 @@ struct clip_ctx { } }; -static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32_batch & imgs) { +static ggml_cgraph * clip_image_build_graph_siglip(clip_ctx * ctx, const clip_image_f32 & img) { const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; + int image_size_width = img.nx; + int image_size_height = img.ny; - const int patch_size = hparams.patch_size; - const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); - const int hidden_size = hparams.hidden_size; - const int n_head = hparams.n_head; - const int d_head = hidden_size / n_head; - const int n_layer = hparams.n_layer; - const float eps = hparams.eps; - - GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1 + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; + const float eps = hparams.eps; struct ggml_init_params params = { /*.mem_size =*/ ctx->buf_compute_meta.size(), @@ -621,15 +613,14 @@ static ggml_tensor * build_rope_2d( return cur; } -static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32_batch & imgs) { +static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_image_f32 & img) { const auto & model = ctx->vision_model; const auto & hparams = model.hparams; GGML_ASSERT(ctx->proj_type == PROJECTOR_TYPE_PIXTRAL); - GGML_ASSERT(imgs.entries.size() == 1); // batch_size == 1 - int image_size_width = imgs.entries[0]->nx; - int image_size_height = imgs.entries[0]->ny; + int image_size_width = img.nx; + int image_size_height = img.ny; const int patch_size = hparams.patch_size; const int n_patches_x = image_size_width / patch_size; @@ -772,18 +763,14 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i } static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { - if (!ctx->has_vision_encoder) { - LOG_ERR("This gguf file seems to have no vision encoder\n"); - return nullptr; - } - const auto & model = ctx->vision_model; const auto & hparams = model.hparams; const int image_size = hparams.image_size; int image_size_width = 
image_size; int image_size_height = image_size; - if (ctx->has_minicpmv_projector) { + + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { LOG_DBG("%s: %d %d\n", __func__, load_image_size.width, load_image_size.height); image_size_width = load_image_size.width; image_size_height = load_image_size.height; @@ -792,7 +779,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im image_size_height = imgs.entries[0]->ny; } } - else if (ctx->has_qwen2vl_merger) { + + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { // use the image's native resolution when image is avaible if (is_inf) { // if (imgs->data->nx && imgs->data->ny) { @@ -800,12 +788,13 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im image_size_height = imgs.entries[0]->ny; } } + const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int patches_w = image_size_width / patch_size; const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - const int num_position_ids = ctx->has_qwen2vl_merger ? num_positions * 4 : num_positions; + const int num_position_ids = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL ? num_positions * 4 : num_positions; const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; @@ -814,7 +803,9 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im const int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector || ctx->has_minicpmv_projector || ctx->has_glm_projector) { + if (ctx->has_llava_projector + || ctx->proj_type == PROJECTOR_TYPE_MINICPMV + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { GGML_ASSERT(batch_size == 1); } @@ -835,8 +826,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); - if (ctx->has_qwen2vl_merger) { - GGML_ASSERT(image_size_width % (patch_size * 2) == 0); + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { + GGML_ASSERT(image_size_width % (patch_size * 2) == 0); GGML_ASSERT(image_size_height % (patch_size * 2) == 0); auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); @@ -865,29 +856,26 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im struct ggml_tensor * embeddings = inp; struct ggml_tensor * pos_embed = nullptr; - if (ctx->has_llava_projector) { - // concat class_embeddings and patch_embeddings - if (model.class_embedding) { - embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); - ggml_set_name(embeddings, "embeddings"); - ggml_set_input(embeddings); - embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); - embeddings = ggml_acc(ctx0, embeddings, inp, - embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); - } + // concat class_embeddings and patch_embeddings + if (model.class_embedding) { + embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size); + embeddings = ggml_scale(ctx0, embeddings, 0.0f); // set to all zeros + embeddings = ggml_acc(ctx0, embeddings, model.class_embedding, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0); + embeddings = 
ggml_acc(ctx0, embeddings, inp, + embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]); } struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); ggml_set_name(positions, "positions"); ggml_set_input(positions); - if (!ctx->has_qwen2vl_merger) { // qwen2vl use rope position embedding + if (ctx->proj_type != PROJECTOR_TYPE_QWEN2VL) { // qwen2vl does NOT use learned position embeddings embeddings = ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions)); } - if (ctx->has_minicpmv_projector) { + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { int pos_w = image_size_width/patch_size; int pos_h = image_size_height/patch_size; if (ctx->minicpmv_version == 2) { @@ -941,7 +929,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].q_w, cur), model.layers[il].q_b); Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { Q = ggml_rope_multi( ctx0, Q, positions, nullptr, d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); @@ -953,7 +941,7 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { K = ggml_rope_multi( ctx0, K, positions, nullptr, d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); @@ -1218,106 +1206,98 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } } // minicpmv projector - else if (ctx->has_minicpmv_projector) - { - if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { - struct ggml_tensor * q = model.mm_model_query; - { // layernorm - q = ggml_norm(ctx0, q, eps); - q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { + struct ggml_tensor * q = model.mm_model_query; + { // layernorm + q = ggml_norm(ctx0, q, eps); + q = ggml_add(ctx0, ggml_mul(ctx0, q, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + } + struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); + { // layernorm + v = ggml_norm(ctx0, v, eps); + v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); + } + struct ggml_tensor * k; + { // position + // q = ggml_add(ctx0, q, model.mm_model_pos_embed); + k = ggml_add(ctx0, v, pos_embed); + } + + { // attention + int hidden_size = 4096; + const int d_head = 128; + int n_head = hidden_size/d_head; + int num_query = 96; + if (ctx->minicpmv_version == 2) { + hidden_size = 4096; + n_head = hidden_size/d_head; + num_query = 96; } - struct ggml_tensor * v = ggml_mul_mat(ctx0, model.mm_model_kv_proj, embeddings); - { // layernorm - v = ggml_norm(ctx0, v, eps); - v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b); + else if (ctx->minicpmv_version == 3) { + hidden_size = 3584; + n_head = hidden_size/d_head; + num_query = 64; } - struct ggml_tensor * k; - { // position - // q = ggml_add(ctx0, q, model.mm_model_pos_embed); - k = ggml_add(ctx0, v, pos_embed); + else if (ctx->minicpmv_version == 4) { + hidden_size = 3584; + n_head = hidden_size/d_head; + num_query = 64; } - { // 
attention - int hidden_size = 4096; - const int d_head = 128; - int n_head = hidden_size/d_head; - int num_query = 96; - if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; - num_query = 96; - } - else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } - else if (ctx->minicpmv_version == 4) { - hidden_size = 3584; - n_head = hidden_size/d_head; - num_query = 64; - } + struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); + struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); + struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); + // permute + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - struct ggml_tensor * Q = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q), model.mm_model_attn_q_b); - struct ggml_tensor * K = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_k_w, k), model.mm_model_attn_k_b); - struct ggml_tensor * V = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_v_w, v), model.mm_model_attn_v_b); - // permute - Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_query, batch_size); - Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); - Q = ggml_reshape_3d(ctx0, Q, d_head, num_query, n_head * batch_size); - K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); - K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); - K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); - V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); - V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); - V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); - KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_query, n_head, batch_size); - KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - KQV = ggml_cont_3d(ctx0, KQV, hidden_size, num_query, batch_size); - - embeddings = ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); - } - { // layernorm - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); - } - embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); + embeddings 
= ggml_add(ctx0, ggml_mul_mat(ctx0, model.mm_model_attn_o_w, KQV), model.mm_model_attn_o_b); } - else { - GGML_ASSERT(false); + { // layernorm + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_post_w), model.mm_model_ln_post_b); } + embeddings = ggml_mul_mat(ctx0, model.mm_model_proj, embeddings); } + // glm projector - else if (ctx->has_glm_projector) { - if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { - size_t gridsz = (size_t)sqrt(embeddings->ne[1]); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); - embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); - embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); - embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); - embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); - embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); - //GLU - { - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); - embeddings = ggml_norm(ctx0, embeddings, eps); - embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); - embeddings = ggml_gelu_inplace(ctx0, embeddings); - struct ggml_tensor * x = embeddings; - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); - x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); - embeddings = ggml_silu_inplace(ctx0, embeddings); - embeddings = ggml_mul(ctx0, embeddings,x); - embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); - } - } else { - GGML_ABORT("fatal error"); + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { + size_t gridsz = (size_t)sqrt(embeddings->ne[1]); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings,1,0,2,3)); + embeddings = ggml_reshape_3d(ctx0, embeddings, gridsz, gridsz, embeddings->ne[1]); + embeddings = ggml_conv_2d(ctx0, model.mm_model_adapter_conv_w, embeddings, 2, 2, 0, 0, 1, 1); + embeddings = ggml_reshape_3d(ctx0, embeddings,embeddings->ne[0]*embeddings->ne[1] , embeddings->ne[2], batch_size); + embeddings = ggml_cont(ctx0, ggml_permute(ctx0,embeddings, 1, 0, 2, 3)); + embeddings = ggml_add(ctx0, embeddings, model.mm_model_adapter_conv_b); + // GLU + { + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings); + embeddings = ggml_norm(ctx0, embeddings, eps); + embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_model_ln_q_w), model.mm_model_ln_q_b); + embeddings = ggml_gelu_inplace(ctx0, embeddings); + struct ggml_tensor * x = embeddings; + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings); + x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x); + embeddings = ggml_silu_inplace(ctx0, embeddings); + embeddings = ggml_mul(ctx0, embeddings,x); + embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings); } } - else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); @@ -1343,11 +1323,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 case PROJECTOR_TYPE_GEMMA3: case PROJECTOR_TYPE_IDEFICS3: { - res = clip_image_build_graph_siglip(ctx, imgs); + GGML_ASSERT(imgs.entries.size() == 1); + res = clip_image_build_graph_siglip(ctx, *imgs.entries[0]); } 
break; case PROJECTOR_TYPE_PIXTRAL: { - res = clip_image_build_graph_pixtral(ctx, imgs); + GGML_ASSERT(imgs.entries.size() == 1); + res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]); } break; default: { @@ -1419,8 +1401,8 @@ struct clip_model_loader { auto & hparams = ctx_clip.vision_model.hparams; // projector type + std::string proj_type; { - std::string proj_type; get_string(KEY_PROJ_TYPE, proj_type, false); if (!proj_type.empty()) { ctx_clip.proj_type = clip_projector_type_from_string(proj_type); @@ -1432,33 +1414,27 @@ struct clip_model_loader { // other hparams { - get_bool(KEY_HAS_TEXT_ENC, ctx_clip.has_text_encoder, false); - get_bool(KEY_HAS_VIS_ENC, ctx_clip.has_vision_encoder, false); - GGML_ASSERT(ctx_clip.has_vision_encoder); - GGML_ASSERT(!ctx_clip.has_text_encoder); - - // legacy keys, use KEY_PROJ_TYPE instead - get_bool(KEY_HAS_LLAVA_PROJ, ctx_clip.has_llava_projector, false); - get_bool(KEY_HAS_MINICPMV_PROJ, ctx_clip.has_minicpmv_projector, false); get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); - get_bool(KEY_HAS_GLM_PROJ, ctx_clip.has_glm_projector, false); - get_bool(KEY_HAS_QWEN2VL_MERGER, ctx_clip.has_qwen2vl_merger, false); - // !!! do NOT extend the list above, use KEY_PROJ_TYPE instead get_bool(KEY_USE_GELU, ctx_clip.use_gelu, false); get_bool(KEY_USE_SILU, ctx_clip.use_silu, false); - get_u32(string_format(KEY_N_EMBD, "vision"), hparams.hidden_size); - get_u32(string_format(KEY_N_HEAD, "vision"), hparams.n_head); - get_u32(string_format(KEY_N_FF, "vision"), hparams.n_intermediate); - get_u32(string_format(KEY_N_BLOCK, "vision"), hparams.n_layer); - get_u32(string_format(KEY_PROJ_DIM, "vision"), hparams.projection_dim); - get_f32(string_format(KEY_LAYER_NORM_EPS, "vision"), hparams.eps); - get_u32(KEY_IMAGE_SIZE, hparams.image_size); - get_u32(KEY_PATCH_SIZE, hparams.patch_size); - get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); + get_u32(KEY_N_EMBD, hparams.hidden_size); + get_u32(KEY_N_HEAD, hparams.n_head); + get_u32(KEY_N_FF, hparams.n_intermediate); + get_u32(KEY_N_BLOCK, hparams.n_layer); + get_u32(KEY_PROJ_DIM, hparams.projection_dim); + get_f32(KEY_LAYER_NORM_EPS, hparams.eps); + get_u32(KEY_IMAGE_SIZE, hparams.image_size); + get_u32(KEY_PATCH_SIZE, hparams.patch_size); + get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false); get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false); + ctx_clip.has_llava_projector = ctx_clip.proj_type == PROJECTOR_TYPE_MLP + || ctx_clip.proj_type == PROJECTOR_TYPE_MLP_NORM + || ctx_clip.proj_type == PROJECTOR_TYPE_LDP + || ctx_clip.proj_type == PROJECTOR_TYPE_LDPV2; + { std::string mm_patch_merge_type; get_string(KEY_MM_PATCH_MERGE_TYPE, mm_patch_merge_type, false); @@ -1491,32 +1467,56 @@ struct clip_model_loader { for (auto & layer : vision_feature_layer) { hparams.vision_feature_layer.insert(layer); } + // Calculate the deepest feature layer based on hparams and projector type - ctx_clip.max_feature_layer = get_deepest_feature_layer(&ctx_clip); + // NOTE: This is only used by build_graph_legacy() + { + // Get the index of the second to last layer; this is the default for models that have a llava projector + int n_layer = hparams.n_layer - 1; + int deepest_feature_layer = -1; + + if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV + || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE + || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL) { + n_layer += 1; + } + + // If we set explicit vision feature layers, only go up to the deepest one + // 
NOTE: only used by granite-vision models for now + for (const auto & feature_layer : hparams.vision_feature_layer) { + if (feature_layer > deepest_feature_layer) { + deepest_feature_layer = feature_layer; + } + } + ctx_clip.max_feature_layer = deepest_feature_layer < 0 ? n_layer : deepest_feature_layer; + } + + // model-specific params + switch (ctx_clip.proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + if (ctx_clip.minicpmv_version == 0) { + ctx_clip.minicpmv_version = 2; // default to 2 if not set + } + } break; + case PROJECTOR_TYPE_IDEFICS3: + { + get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + hparams.rope_theta = 10000.0f; + } break; + default: + break; + } - LOG_INF("%s: text_encoder: %d\n", __func__, ctx_clip.has_text_encoder); - LOG_INF("%s: vision_encoder: %d\n", __func__, ctx_clip.has_vision_encoder); - LOG_INF("%s: llava_projector: %d\n", __func__, ctx_clip.has_llava_projector); - LOG_INF("%s: minicpmv_projector: %d\n", __func__, ctx_clip.has_minicpmv_projector); + LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); + LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector); LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); - LOG_INF("%s: glm_projector: %d\n", __func__, ctx_clip.has_glm_projector); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } - - // model-specific params - switch (ctx_clip.proj_type) { - case PROJECTOR_TYPE_IDEFICS3: - { - get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor, false); - } break; - case PROJECTOR_TYPE_PIXTRAL: - { - hparams.rope_theta = 10000.0f; - } break; - default: - break; - } } void load_tensors() { @@ -1569,9 +1569,6 @@ struct clip_model_loader { vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false); vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false); vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false); - if (vision_model.patch_embeddings_1 == nullptr) { - ctx_clip.has_qwen2vl_merger = false; - } vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, "v"), false); @@ -1669,7 +1666,7 @@ struct clip_model_loader { vision_model.mm_model_peg_0_w = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "weight")); vision_model.mm_model_peg_0_b = get_tensor(string_format(TN_MVLM_PROJ_PEG, 0, "bias")); } break; - case PROJECTOR_TYPE_RESAMPLER: + case PROJECTOR_TYPE_MINICPMV: { // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD); vision_model.mm_model_pos_embed_k = get_tensor(TN_MINICPMV_POS_EMBD_K); @@ -1702,7 +1699,7 @@ struct clip_model_loader { vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight")); vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); } break; - case PROJECTOR_TYPE_MERGER: + case PROJECTOR_TYPE_QWEN2VL: { vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); @@ -2479,11 +2476,6 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) { // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector // res_imgs memory is being allocated here, previous allocations will be freed if found bool 
clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - clip_image_size original_size{img->nx, img->ny}; bool pad_to_square = true; auto & params = ctx->vision_model.hparams; @@ -2504,7 +2496,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } return true; } - else if (ctx->has_qwen2vl_merger) { + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { clip_image_u8 resized; auto patch_size = clip_get_patch_size(ctx) * 2; int nx = ceil((float)img->nx / patch_size) * patch_size; @@ -2518,7 +2510,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str res_imgs->entries.push_back(std::move(img_f32)); return true; } - else if (ctx->has_glm_projector + else if (ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { clip_image_u8 resized_image; @@ -2646,7 +2638,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2 || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { n_patches /= 4; - } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) { + } else if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { if (ctx->minicpmv_version == 2) { n_patches = 96; } @@ -2656,7 +2648,10 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i else if (ctx->minicpmv_version == 4) { n_patches = 64; } - } else if (ctx->proj_type == PROJECTOR_TYPE_MERGER) { + else { + GGML_ABORT("Unknown minicpmv version"); + } + } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { int patch_size = params.patch_size * 2; int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); @@ -2761,11 +2756,6 @@ static std::vector> get_2d_sincos_pos_embed(int embed_dim, co } bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f32 * img, float * vec) { - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - clip_image_f32_batch imgs; clip_image_f32_ptr img_copy(clip_image_f32_init()); *img_copy = *img; @@ -2776,20 +2766,11 @@ bool clip_image_encode(struct clip_ctx * ctx, const int n_threads, clip_image_f3 bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_image_f32_batch * imgs_c_ptr, float * vec) { const clip_image_f32_batch & imgs = *imgs_c_ptr; - - if (!ctx->has_vision_encoder) { - LOG_ERR("%s: This gguf file seems to have no vision encoder\n", __func__); - return false; - } - int batch_size = imgs.entries.size(); - if (ctx->has_llava_projector) { - GGML_ASSERT(batch_size == 1); // TODO: support multiple images - } - if (ctx->has_minicpmv_projector) { - GGML_ASSERT(batch_size == 1); - } - if (ctx->has_glm_projector) { + + if (ctx->has_llava_projector + || ctx->proj_type == PROJECTOR_TYPE_MINICPMV + || ctx->proj_type == PROJECTOR_TYPE_GLM_EDGE) { GGML_ASSERT(batch_size == 1); } @@ -2799,21 +2780,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_sched_alloc_graph(ctx->sched.get(), gf); // set inputs - const auto & model = ctx->vision_model; + const auto & model = ctx->vision_model; const auto & hparams = model.hparams; - // TODO 
@ngxson : this is ugly, need to refactor later - bool support_dynamic_size = ctx->has_minicpmv_projector - || ctx->has_qwen2vl_merger - || ctx->proj_type == PROJECTOR_TYPE_PIXTRAL; + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; - const int image_size = hparams.image_size; - int image_size_width = image_size; - int image_size_height = image_size; - if (support_dynamic_size) { - image_size_width = imgs.entries[0]->nx; - image_size_height = imgs.entries[0]->ny; - } const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (model.class_embedding ? 1 : 0); @@ -2839,14 +2811,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima for (size_t i = 0; i < imgs.entries.size(); i++) { const int nx = imgs.entries[i]->nx; const int ny = imgs.entries[i]->ny; - - if (ctx->has_glm_projector - || ctx->has_llava_projector - || ctx->proj_type == PROJECTOR_TYPE_GEMMA3 - || ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { - GGML_ASSERT(nx == image_size && ny == image_size); - } - const int n = nx * ny; for (int b = 0; b < batch_size; b++) { @@ -2864,13 +2828,15 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); } - if (ctx->has_minicpmv_projector) { + + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { { // inspired from siglip: // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - int* positions_data = (int*)malloc(ggml_nbytes(positions)); + std::vector pos_data(ggml_nelements(positions)); + int * data = pos_data.data(); int bucket_coords_h[1024]; int bucket_coords_w[1024]; for (int i = 0; i < pos_h; i++){ @@ -2881,11 +2847,10 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } for (int i = 0, id = 0; i < pos_h; i++){ for (int j = 0; j < pos_w; j++){ - positions_data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; } } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); + ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions)); } { @@ -2903,30 +2868,28 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima else if (ctx->minicpmv_version == 4) { embed_dim = 3584; } + else { + GGML_ABORT("Unknown minicpmv version"); + } + + // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? 
auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); - float * pos_embed_data = (float *)malloc(ggml_nbytes(pos_embed)); - for(int i=0;i < pos_w * pos_h; ++i){ - for(int j=0; j < embed_dim; ++j){ - pos_embed_data[i * embed_dim + j] = pos_embed_t[i][j]; + std::vector pos_data(ggml_nelements(pos_embed)); + float * data = pos_data.data(); + for(int i = 0; i < pos_w * pos_h; ++i){ + for(int j = 0; j < embed_dim; ++j){ + data[i * embed_dim + j] = pos_embed_t[i][j]; } } - ggml_backend_tensor_set(pos_embed, pos_embed_data, 0, ggml_nbytes(pos_embed)); - free(pos_embed_data); + ggml_backend_tensor_set(pos_embed, data, 0, ggml_nbytes(pos_embed)); } } else { - if (model.class_embedding) { - struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings"); + // non-minicpmv models - void* zero_mem = malloc(ggml_nbytes(embeddings)); - memset(zero_mem, 0, ggml_nbytes(embeddings)); - ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings)); - free(zero_mem); - } - - if (ctx->has_qwen2vl_merger) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); const int pw = image_size_width / patch_size; @@ -2978,6 +2941,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos)); } else { + // llava and other models struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); int* positions_data = (int*)malloc(ggml_nbytes(positions)); @@ -2987,7 +2951,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); free(positions_data); - if (!ctx->has_glm_projector) { + if (ctx->proj_type != PROJECTOR_TYPE_GLM_EDGE) { struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); // The patches vector is used to get rows to index into the embeds with; // we should skip dim 0 only if we have CLS to avoid going out of bounds @@ -3166,7 +3130,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->vision_model.mm_2_b->ne[0]; case PROJECTOR_TYPE_MLP_NORM: return ctx->vision_model.mm_3_b->ne[0]; - case PROJECTOR_TYPE_RESAMPLER: + case PROJECTOR_TYPE_MINICPMV: if (ctx->minicpmv_version == 2) { return 4096; } else if (ctx->minicpmv_version == 3) { @@ -3174,36 +3138,33 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { } else if (ctx->minicpmv_version == 4) { return 3584; } - break; // Should not happen if version is valid + GGML_ABORT("Unknown minicpmv version"); case PROJECTOR_TYPE_GLM_EDGE: return ctx->vision_model.mm_model_mlp_3_w->ne[1]; - case PROJECTOR_TYPE_MERGER: + case PROJECTOR_TYPE_QWEN2VL: return ctx->vision_model.mm_1_b->ne[0]; case PROJECTOR_TYPE_GEMMA3: return ctx->vision_model.mm_input_proj_w->ne[0]; case PROJECTOR_TYPE_IDEFICS3: return ctx->vision_model.projection->ne[1]; default: - break; // Fall through to throw + GGML_ABORT("Unknown projector type"); } - - std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type]; - throw std::runtime_error(string_format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str())); } int clip_is_minicpmv(const struct clip_ctx * ctx) { - if (ctx->has_minicpmv_projector) { + if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { return ctx->minicpmv_version; } return 0; } bool clip_is_glm(const struct clip_ctx * ctx) { - return ctx->has_glm_projector; + return ctx->proj_type == 
PROJECTOR_TYPE_GLM_EDGE; } bool clip_is_qwen2vl(const struct clip_ctx * ctx) { - return ctx->has_qwen2vl_merger; + return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL; } bool clip_is_llava(const struct clip_ctx * ctx) { @@ -3214,29 +3175,6 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) { return ctx->proj_type == PROJECTOR_TYPE_GEMMA3; } -// Determine the number of encoder layers to iterate over -int get_deepest_feature_layer(const struct clip_ctx * ctx) { - // Get the index of the second to last layer; this is the - // default for models that have a llava projector - const auto & hparams = ctx->vision_model.hparams; - int n_layer = hparams.n_layer - 1; - int deepest_feature_layer = -1; - - // Handle other projectors; incrementing here indicates that we - // should use the last encoder layer for the vision features. - if (ctx->has_minicpmv_projector || ctx->has_glm_projector || ctx->has_qwen2vl_merger) { - n_layer += 1; - } - - // If we set explicit vision feature layers, only go up to the deepest one - for (const auto & feature_layer : hparams.vision_feature_layer) { - if (feature_layer > deepest_feature_layer) { - deepest_feature_layer = feature_layer; - } - } - return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer; -} - bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { clip_image_f32 clip_img; clip_img.buf.resize(h * w * 3); diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 5fc45d3e23904..6ba42ad892146 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -114,8 +114,6 @@ CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx); CLIP_API bool clip_is_llava(const struct clip_ctx * ctx); CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx); -CLIP_API int get_deepest_feature_layer(const struct clip_ctx * ctx); - CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec); From 2d451c80590b9ac250322769ac13d3b4870dbcf7 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Sat, 26 Apr 2025 22:58:12 +0200 Subject: [PATCH 037/200] common : add common_remote_get_content (#13123) * common : add common_remote_get_content * support max size and timeout * add tests --- common/arg.cpp | 105 ++++++++++++++++++++++++++------------ common/arg.h | 9 ++++ tests/test-arg-parser.cpp | 47 +++++++++++++++++ 3 files changed, 127 insertions(+), 34 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 0657553e4e9cf..de173159f4a76 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -162,6 +162,10 @@ struct common_hf_file_res { #ifdef LLAMA_USE_CURL +bool common_has_curl() { + return true; +} + #ifdef __linux__ #include #elif defined(_WIN32) @@ -527,64 +531,89 @@ static bool common_download_model( return true; } -/** - * Allow getting the HF file from the HF repo with tag (like ollama), for example: - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 - * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M - * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s - * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) - * - * Return pair of (with "repo" already having tag removed) - * - * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. 
- */ -static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) { - auto parts = string_split(hf_repo_with_tag, ':'); - std::string tag = parts.size() > 1 ? parts.back() : "latest"; - std::string hf_repo = parts[0]; - if (string_split(hf_repo, '/').size() != 2) { - throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); - } - - // fetch model info from Hugging Face Hub API +std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params) { curl_ptr curl(curl_easy_init(), &curl_easy_cleanup); curl_slist_ptr http_headers; - std::string res_str; + std::vector res_buffer; - std::string model_endpoint = get_model_endpoint(); - - std::string url = model_endpoint + "v2/" + hf_repo + "/manifests/" + tag; curl_easy_setopt(curl.get(), CURLOPT_URL, url.c_str()); curl_easy_setopt(curl.get(), CURLOPT_NOPROGRESS, 1L); + curl_easy_setopt(curl.get(), CURLOPT_FOLLOWLOCATION, 1L); typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * ptr, size_t size, size_t nmemb, void * data); auto write_callback = [](void * ptr, size_t size, size_t nmemb, void * data) -> size_t { - static_cast(data)->append((char * ) ptr, size * nmemb); + auto data_vec = static_cast *>(data); + data_vec->insert(data_vec->end(), (char *)ptr, (char *)ptr + size * nmemb); return size * nmemb; }; curl_easy_setopt(curl.get(), CURLOPT_WRITEFUNCTION, static_cast(write_callback)); - curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_str); + curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, &res_buffer); #if defined(_WIN32) curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); #endif - if (!bearer_token.empty()) { - std::string auth_header = "Authorization: Bearer " + bearer_token; - http_headers.ptr = curl_slist_append(http_headers.ptr, auth_header.c_str()); + if (params.timeout > 0) { + curl_easy_setopt(curl.get(), CURLOPT_TIMEOUT, params.timeout); + } + if (params.max_size > 0) { + curl_easy_setopt(curl.get(), CURLOPT_MAXFILESIZE, params.max_size); } - // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response http_headers.ptr = curl_slist_append(http_headers.ptr, "User-Agent: llama-cpp"); - http_headers.ptr = curl_slist_append(http_headers.ptr, "Accept: application/json"); + for (const auto & header : params.headers) { + http_headers.ptr = curl_slist_append(http_headers.ptr, header.c_str()); + } curl_easy_setopt(curl.get(), CURLOPT_HTTPHEADER, http_headers.ptr); CURLcode res = curl_easy_perform(curl.get()); if (res != CURLE_OK) { - throw std::runtime_error("error: cannot make GET request to HF API"); + std::string error_msg = curl_easy_strerror(res); + throw std::runtime_error("error: cannot make GET request: " + error_msg); } long res_code; - std::string ggufFile = ""; - std::string mmprojFile = ""; curl_easy_getinfo(curl.get(), CURLINFO_RESPONSE_CODE, &res_code); + + return { res_code, std::move(res_buffer) }; +} + +/** + * Allow getting the HF file from the HF repo with tag (like ollama), for example: + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q4 + * - bartowski/Llama-3.2-3B-Instruct-GGUF:Q4_K_M + * - bartowski/Llama-3.2-3B-Instruct-GGUF:q5_k_s + * Tag is optional, default to "latest" (meaning it checks for Q4_K_M first, then Q4, then if not found, return the first GGUF file in repo) + * + * Return pair of (with "repo" already having tag removed) + * + * Note: we use the Ollama-compatible HF API, but not using the blobId. 
Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files. + */ +static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) { + auto parts = string_split(hf_repo_with_tag, ':'); + std::string tag = parts.size() > 1 ? parts.back() : "latest"; + std::string hf_repo = parts[0]; + if (string_split(hf_repo, '/').size() != 2) { + throw std::invalid_argument("error: invalid HF repo format, expected /[:quant]\n"); + } + + std::string url = get_model_endpoint() + "v2/" + hf_repo + "/manifests/" + tag; + + // headers + std::vector headers; + headers.push_back("Accept: application/json"); + if (!bearer_token.empty()) { + headers.push_back("Authorization: Bearer " + bearer_token); + } + // Important: the User-Agent must be "llama-cpp" to get the "ggufFile" field in the response + // User-Agent header is already set in common_remote_get_content, no need to set it here + + // make the request + common_remote_params params; + params.headers = headers; + auto res = common_remote_get_content(url, params); + long res_code = res.first; + std::string res_str(res.second.data(), res.second.size()); + std::string ggufFile; + std::string mmprojFile; + if (res_code == 200) { // extract ggufFile.rfilename in json, using regex { @@ -618,6 +647,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_ #else +bool common_has_curl() { + return false; +} + static bool common_download_file_single(const std::string &, const std::string &, const std::string &) { LOG_ERR("error: built without CURL, cannot download model from internet\n"); return false; @@ -640,6 +673,10 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s return {}; } +std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params) { + throw std::runtime_error("error: built without CURL, cannot download model from the internet"); +} + #endif // LLAMA_USE_CURL // diff --git a/common/arg.h b/common/arg.h index 49ab8667b1052..70bea100fd4f2 100644 --- a/common/arg.h +++ b/common/arg.h @@ -78,3 +78,12 @@ bool common_params_parse(int argc, char ** argv, common_params & params, llama_e // function to be used by test-arg-parser common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr); +bool common_has_curl(); + +struct common_remote_params { + std::vector headers; + long timeout = 0; // CURLOPT_TIMEOUT, in seconds ; 0 means no timeout + long max_size = 0; // max size of the response ; unlimited if 0 ; max is 2GB +}; +// get remote file content, returns +std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params); diff --git a/tests/test-arg-parser.cpp b/tests/test-arg-parser.cpp index 537fc63a4c975..21dbd5404222f 100644 --- a/tests/test-arg-parser.cpp +++ b/tests/test-arg-parser.cpp @@ -126,6 +126,53 @@ int main(void) { assert(params.cpuparams.n_threads == 1010); #endif // _WIN32 + if (common_has_curl()) { + printf("test-arg-parser: test curl-related functions\n\n"); + const char * GOOD_URL = "https://raw.githubusercontent.com/ggml-org/llama.cpp/refs/heads/master/README.md"; + const char * BAD_URL = "https://www.google.com/404"; + const char * BIG_FILE = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v1.bin"; + + { + printf("test-arg-parser: test good URL\n\n"); + auto res = 
common_remote_get_content(GOOD_URL, {}); + assert(res.first == 200); + assert(res.second.size() > 0); + std::string str(res.second.data(), res.second.size()); + assert(str.find("llama.cpp") != std::string::npos); + } + + { + printf("test-arg-parser: test bad URL\n\n"); + auto res = common_remote_get_content(BAD_URL, {}); + assert(res.first == 404); + } + + { + printf("test-arg-parser: test max size error\n"); + common_remote_params params; + params.max_size = 1; + try { + common_remote_get_content(GOOD_URL, params); + assert(false && "it should throw an error"); + } catch (std::exception & e) { + printf(" expected error: %s\n\n", e.what()); + } + } + + { + printf("test-arg-parser: test timeout error\n"); + common_remote_params params; + params.timeout = 1; + try { + common_remote_get_content(BIG_FILE, params); + assert(false && "it should throw an error"); + } catch (std::exception & e) { + printf(" expected error: %s\n\n", e.what()); + } + } + } else { + printf("test-arg-parser: no curl, skipping curl-related functions\n"); + } printf("test-arg-parser: all tests OK\n\n"); } From ca2bb89eac2097ab4620448737e58af8452e444b Mon Sep 17 00:00:00 2001 From: HimariO Date: Sun, 27 Apr 2025 16:10:34 +0800 Subject: [PATCH 038/200] clip : Add Qwen2.5VL support (#12402) * implment vision model architecture, gguf convertor * handle window attention inputs * add debug utils * fix few incorrect tensor memory layout * move position id remap out of ggml to avoid int32 cuda operations * cleaning up * ignore transformers Qwen2_5_xxx type check * remove not so often use `qwen2vl-cli` debug functions * remove commented-out code blocks * fix attn weight scaling after rebase * add `PROJECTOR_TYPE_QWEN2_5_VL` * remove `KEY_USE_GLU_MLP`, `KEY_USE_RMS_NORM` * replace `KEY_FULLATTN_BLK_IDX` with `KEY_WIN_ATTN_PATTERN` * remove `attn_window_size` from gguf * fix model conversion * clean up * fix merging problem * add test --------- Co-authored-by: Xuan Son Nguyen --- convert_hf_to_gguf.py | 11 +- examples/llava/clip-impl.h | 8 + examples/llava/clip.cpp | 398 ++++++++++++++++++++++++++++- examples/llava/qwen2_vl_surgery.py | 188 +++++++++----- examples/llava/qwen2vl-cli.cpp | 96 +++++-- examples/llava/tests.sh | 1 + 6 files changed, 597 insertions(+), 105 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index cf35fb86ecfec..ea3a951b93753 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -2554,11 +2554,12 @@ def set_vocab(self): except FileNotFoundError: self._set_vocab_gpt2() - def get_tensors(self) -> Iterator[tuple[str, Tensor]]: - for name, data in super().get_tensors(): - if name.startswith("visual."): - continue - yield name, data + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused + if name.startswith("visual."): + # skip visual tensors + return [] + return [(self.map_tensor_name(name), data_torch)] @ModelBase.register("WavTokenizerDec") diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h index 16d0a8efc56ae..04bfcbb5e575f 100644 --- a/examples/llava/clip-impl.h +++ b/examples/llava/clip-impl.h @@ -34,9 +34,14 @@ #define KEY_PROJ_SCALE_FACTOR "clip.vision.projector.scale_factor" #define KEY_PROJ_TYPE "clip.projector_type" +#define KEY_USE_GLU_MLP "clip.use_glu_mlp" // for qwen2.5vl +#define KEY_USE_RMS_NORM "clip.use_rms_norm" // for qwen2.5vl + #define KEY_MM_PATCH_MERGE_TYPE "clip.vision.mm_patch_merge_type" #define KEY_IMAGE_GRID_PINPOINTS "clip.vision.image_grid_pinpoints" #define 
KEY_IMAGE_CROP_RESOLUTION "clip.vision.image_crop_resolution" +#define KEY_WIN_ATTN_PATTERN "clip.vision.n_wa_pattern" +#define KEY_ATTN_WINDOW_SIZE "clip.vision.window_size" // @@ -55,6 +60,7 @@ #define TN_FFN_DOWN "%s.blk.%d.ffn_down.%s" #define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_FFN_UP "%s.blk.%d.ffn_up.%s" +#define TN_FFN_GATE "%s.blk.%d.ffn_gate.%s" #define TN_LN_1 "%s.blk.%d.ln1.%s" #define TN_LN_2 "%s.blk.%d.ln2.%s" #define TN_LN_PRE "%s.pre_ln.%s" @@ -95,6 +101,7 @@ enum projector_type { PROJECTOR_TYPE_GEMMA3, PROJECTOR_TYPE_IDEFICS3, PROJECTOR_TYPE_PIXTRAL, + PROJECTOR_TYPE_QWEN25VL, PROJECTOR_TYPE_UNKNOWN, }; @@ -105,6 +112,7 @@ static std::map PROJECTOR_TYPE_NAMES = { { PROJECTOR_TYPE_MINICPMV, "resampler"}, { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, { PROJECTOR_TYPE_GEMMA3, "gemma3"}, { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index e8c01c68a9779..b6a1f40e8a580 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -28,6 +28,7 @@ #include #include #include +#include struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callback_default, NULL}; @@ -169,6 +170,8 @@ struct clip_hparams { std::vector image_grid_pinpoints; int32_t image_crop_resolution; std::unordered_set vision_feature_layer; + int32_t attn_window_size; + int32_t n_wa_pattern; }; struct clip_layer { @@ -200,6 +203,9 @@ struct clip_layer { struct ggml_tensor * ff_down_w = nullptr; struct ggml_tensor * ff_down_b = nullptr; + struct ggml_tensor * ff_g_w = NULL; + struct ggml_tensor * ff_g_b = NULL; + // layernorm 2 struct ggml_tensor * ln_2_w = nullptr; struct ggml_tensor * ln_2_b = nullptr; @@ -319,6 +325,7 @@ struct clip_ctx { float image_std[3]; bool use_gelu = false; bool use_silu = false; + int32_t ftype = 1; gguf_context_ptr ctx_gguf; ggml_context_ptr ctx_data; @@ -762,6 +769,236 @@ static ggml_cgraph * clip_image_build_graph_pixtral(clip_ctx * ctx, const clip_i return gf; } +static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_image_f32_batch & imgs) { + const auto & model = ctx->vision_model; + const auto & hparams = model.hparams; + + const int image_size_width = imgs.entries[0]->nx; + const int image_size_height = imgs.entries[0]->ny; + + const bool use_mrope = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL; + const bool use_window_attn = hparams.n_wa_pattern > 0; + + const int n_wa_pattern = hparams.n_wa_pattern; + const int patch_size = hparams.patch_size; + const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); + const int patches_w = image_size_width / patch_size; + const int patches_h = image_size_height / patch_size; + const int num_positions = num_patches + (model.class_embedding ? 1 : 0); + const int num_position_ids = use_mrope ? 
num_positions * 4 : num_positions; + const int hidden_size = hparams.hidden_size; + const int n_head = hparams.n_head; + const int d_head = hidden_size / n_head; + const float eps = hparams.eps; + + int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; + + const int batch_size = imgs.entries.size(); + GGML_ASSERT(batch_size == 1); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx->buf_compute_meta.size(), + /*.mem_buffer =*/ ctx->buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + ggml_context_ptr ctx0_ptr(ggml_init(params)); + auto ctx0 = ctx0_ptr.get(); + + struct ggml_cgraph * gf = ggml_new_graph(ctx0); + + struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, image_size_width, image_size_height, 3, batch_size); + ggml_set_name(inp_raw, "inp_raw"); + ggml_set_input(inp_raw); + + struct ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + + GGML_ASSERT(image_size_width % (patch_size * 2) == 0); + GGML_ASSERT(image_size_height % (patch_size * 2) == 0); + + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, patches_h, batch_size); + inp = ggml_reshape_4d( + ctx0, inp, + hidden_size * 2, patches_w / 2, 2, batch_size * (patches_h / 2)); + inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3)); + inp = ggml_reshape_3d( + ctx0, inp, + hidden_size, patches_w * patches_h, batch_size); + + if (model.patch_bias) { + // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp)); + inp = ggml_add(ctx0, inp, model.patch_bias); + } + struct ggml_tensor * embeddings = inp; + struct ggml_tensor * window_mask = nullptr; + struct ggml_tensor * window_idx = nullptr; + struct ggml_tensor * inv_window_idx = nullptr; + + struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_position_ids); + ggml_set_name(positions, "positions"); + ggml_set_input(positions); + + // pre-layernorm + if (model.pre_ln_w) { + embeddings = ggml_rms_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "pre_ln"); + + embeddings = ggml_mul(ctx0, embeddings, model.pre_ln_w); + } + + if (use_window_attn) { + // handle window attention inputs + inv_window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4); + ggml_set_name(inv_window_idx, "inv_window_idx"); + ggml_set_input(inv_window_idx); + // mask for window attention + window_mask = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, num_positions, num_positions); + ggml_set_name(window_mask, "window_mask"); + ggml_set_input(window_mask); + + // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hidden_size * 4, patches_w * patches_h * batch_size / 4); + embeddings = ggml_get_rows(ctx0, embeddings, inv_window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size, patches_w * patches_h, batch_size); + } + + // loop over layers + for (int il = 0; il < ctx->max_feature_layer; il++) { + struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states + + // rmsnorm1 + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.layers[il].ln_1_w); + + // self-attention + { + + struct ggml_tensor * Q = + ggml_add(ctx0, ggml_mul_mat(ctx0, 
model.layers[il].q_w, cur), model.layers[il].q_b); + + Q = ggml_reshape_4d(ctx0, Q, d_head, n_head, num_positions, batch_size); + Q = ggml_rope_multi( + ctx0, Q, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + Q = ggml_cont(ctx0, ggml_permute(ctx0, Q, 0, 2, 1, 3)); + Q = ggml_reshape_3d(ctx0, Q, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * K = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].k_w, cur), model.layers[il].k_b); + + K = ggml_reshape_4d(ctx0, K, d_head, n_head, num_positions, batch_size); + K = ggml_rope_multi( + ctx0, K, positions, nullptr, + d_head/2, mrope_sections, GGML_ROPE_TYPE_VISION, 32768, 10000, 1, 0, 1, 32, 1); + K = ggml_cont(ctx0, ggml_permute(ctx0, K, 0, 2, 1, 3)); + K = ggml_reshape_3d(ctx0, K, d_head, num_positions, n_head * batch_size); + + struct ggml_tensor * V = + ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].v_w, cur), model.layers[il].v_b); + + V = ggml_reshape_4d(ctx0, V, d_head, n_head, num_positions, batch_size); + V = ggml_cont(ctx0, ggml_permute(ctx0, V, 1, 2, 0, 3)); + V = ggml_reshape_3d(ctx0, V, num_positions, d_head, n_head * batch_size); + + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + const bool full_attn = use_window_attn ? (il + 1) % n_wa_pattern == 0 : true; + if (full_attn) { + KQ = ggml_soft_max_ext(ctx0, KQ, nullptr, 1.0f / sqrtf((float)d_head), 0.0f); + } else { + KQ = ggml_soft_max_ext(ctx0, KQ, window_mask, 1.0f / sqrtf((float)d_head), 0.0f); + } + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ); + KQV = ggml_reshape_4d(ctx0, KQV, d_head, num_positions, n_head, batch_size); + KQV = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + + cur = ggml_cont_3d(ctx0, KQV, hidden_size, num_positions, batch_size); + } + + // attention output + cur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].o_w, cur), model.layers[il].o_b); + + // re-add the layer input, e.g., residual + cur = ggml_add(ctx0, cur, embeddings); + + embeddings = cur; // embeddings = residual, cur = hidden_states + + // rms norm2 + cur = ggml_rms_norm(ctx0, cur, eps); + cur = ggml_mul(ctx0, cur, model.layers[il].ln_2_w); + + // mlp + // ffn_up + auto cur_up = ggml_mul_mat(ctx0, model.layers[il].ff_o_w, cur); + cur_up = ggml_add(ctx0, cur_up, model.layers[il].ff_o_b); + + auto cur_gate = ggml_mul_mat(ctx0, model.layers[il].ff_g_w, cur); + cur_gate = ggml_add(ctx0, cur_gate, model.layers[il].ff_g_b); + // TODO : only 2 of these 3 are actually used, should we remove one of them? 
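+            // gated MLP: exactly one of the activations below is applied to the gate projection,
+            // which is then multiplied element-wise with the up projection (SwiGLU-style when SiLU
+            // is selected)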
+ if (ctx->use_gelu) { + cur_gate = ggml_gelu_inplace(ctx0, cur_gate); + } else if (ctx->use_silu) { + cur_gate = ggml_silu_inplace(ctx0, cur_gate); + } else { + cur_gate = ggml_gelu_quick_inplace(ctx0, cur_gate); + } + cur = ggml_mul(ctx0, cur_gate, cur_up); + + // ffn_down + cur = ggml_mul_mat(ctx0, model.layers[il].ff_i_w, cur); + cur = ggml_add(ctx0, cur, model.layers[il].ff_i_b); + + // residual 2 + cur = ggml_add(ctx0, embeddings, cur); + + embeddings = cur; + } + + // post-layernorm + if (model.post_ln_w) { + embeddings = ggml_rms_norm(ctx0, embeddings, eps); + ggml_set_name(embeddings, "post_ln"); + + embeddings = ggml_mul(ctx0, embeddings, model.post_ln_w); + } + + embeddings = ggml_reshape_3d(ctx0, embeddings, hidden_size * 4, num_positions / 4, batch_size); + + embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_0_b); + + // GELU activation + embeddings = ggml_gelu(ctx0, embeddings); + + // Second linear layer + embeddings = ggml_mul_mat(ctx0, model.mm_1_w, embeddings); + embeddings = ggml_add(ctx0, embeddings, model.mm_1_b); + + if (use_window_attn) { + window_idx = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions / 4); + ggml_set_name(window_idx, "window_idx"); + ggml_set_input(window_idx); + + // embeddings shape: [hidden_size, patches_w * patches_h, batch_size] + GGML_ASSERT(batch_size == 1); + embeddings = ggml_reshape_2d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4); + embeddings = ggml_get_rows(ctx0, embeddings, window_idx); + embeddings = ggml_reshape_3d(ctx0, embeddings, hparams.projection_dim, patches_w * patches_h / 4, batch_size); + } + + // build the graph + ggml_build_forward_expand(gf, embeddings); + + return gf; +} + static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_image_f32_batch & imgs, struct clip_image_size load_image_size, bool is_inf = false) { const auto & model = ctx->vision_model; const auto & hparams = model.hparams; @@ -1331,6 +1568,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 GGML_ASSERT(imgs.entries.size() == 1); res = clip_image_build_graph_pixtral(ctx, *imgs.entries[0]); } break; + case PROJECTOR_TYPE_QWEN25VL: + { + res = clip_image_build_graph_qwen25vl(ctx, imgs); + } break; default: { // TODO: we should have one build_* function per model @@ -1507,6 +1748,10 @@ struct clip_model_loader { { hparams.rope_theta = 10000.0f; } break; + case PROJECTOR_TYPE_QWEN25VL: + { + get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern); + } break; default: break; } @@ -1600,8 +1845,10 @@ struct clip_model_loader { // legacy naming (the in and out is reversed! 
don't ask me why) layer.ff_i_w = layer.ff_down_w; layer.ff_o_w = layer.ff_up_w; + layer.ff_g_w = layer.ff_gate_w; layer.ff_i_b = layer.ff_down_b; layer.ff_o_b = layer.ff_up_b; + layer.ff_g_b = layer.ff_gate_b; } switch (ctx_clip.proj_type) { @@ -1700,6 +1947,7 @@ struct clip_model_loader { vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight")); } break; case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: { vision_model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight")); vision_model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias")); @@ -2651,7 +2899,7 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i else { GGML_ABORT("Unknown minicpmv version"); } - } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { + } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { int patch_size = params.patch_size * 2; int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0); int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0); @@ -2792,6 +3040,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int pos_w = ctx->load_image_size.width / patch_size; const int pos_h = ctx->load_image_size.height / patch_size; + const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl + { struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); std::vector inp_data(ggml_nelements(inp_raw)); @@ -2890,31 +3140,93 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // non-minicpmv models if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + // pw * ph = number of tokens output by ViT after apply patch merger + // ipw * ipw = number of vision token been processed inside ViT + const int merge_ratio = 2; + const int pw = image_size_width / patch_size / merge_ratio; + const int ph = image_size_height / patch_size / merge_ratio; + const int ipw = image_size_width / patch_size; + const int iph = image_size_height / patch_size; + + std::vector idx (ph * pw); + std::vector inv_idx(ph * pw); + + if (use_window_attn) { + const int attn_window_size = 112; + struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx"); + struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx"); + struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask"); + + const int grid_window = attn_window_size / patch_size / merge_ratio; + int dst = 0; + // [num_vision_tokens, num_vision_tokens] attention mask tensor + std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); + int mask_row = 0; + + for (int y = 0; y < ph; y += grid_window) + { + for (int x = 0; x < pw; x += grid_window) + { + const int win_h = std::min(grid_window, ph - y); + const int win_w = std::min(grid_window, pw - x); + const int dst_0 = dst; + // group all tokens belong to the same window togather (to a continue range) + for (int dy = 0; dy < win_h; dy++) { + for (int dx = 0; dx < win_w; dx++) { + const int src = (y + dy) * pw + (x + dx); + assert(src < (int)idx.size()); + assert(dst < (int)inv_idx.size()); + idx [src] = dst; + inv_idx[dst] = src; + dst++; + } + } - const int pw = image_size_width / patch_size; - const int ph = image_size_height / patch_size; - int* positions_data = (int*)malloc(ggml_nbytes(positions)); + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { + 
int row_offset = mask_row * (ipw * iph); + std::fill( + mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), + 0.0); + mask_row++; + } + } + } + + ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx)); + ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx)); + ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask)); + } else { + std::iota(idx.begin(), idx.end(), 0); + std::iota(inv_idx.begin(), inv_idx.end(), 0); + } + + struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); + const int mpow = merge_ratio * merge_ratio; + std::vector positions_data(ggml_nelements(positions)); + int * data = positions_data.data(); int ptr = 0; - for (int y = 0; y < ph; y+=2) + for (int y = 0; y < iph; y += merge_ratio) { - for (int x = 0; x < pw; x+=2) + for (int x = 0; x < ipw; x += merge_ratio) { for (int dy = 0; dy < 2; dy++) { for (int dx = 0; dx < 2; dx++) { - positions_data[ptr] = y + dy; - positions_data[num_patches + ptr] = x + dx; - positions_data[num_patches * 2 + ptr] = y + dy; - positions_data[num_patches * 3 + ptr] = x + dx; + auto remap = idx[ptr / mpow]; + remap = remap * mpow + (ptr % mpow); + + data[ remap] = y + dy; + data[ num_patches + remap] = x + dx; + data[2 * num_patches + remap] = y + dy; + data[3 * num_patches + remap] = x + dx; ptr++; } } } } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); + ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions)); } else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { // do nothing @@ -2967,6 +3279,65 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } + if (use_window_attn && ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx"); + struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx"); + struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask"); + + const int merge_ratio = 2; + const int attn_window_size = 112; + const int pw = image_size_width / patch_size / merge_ratio; + const int ph = image_size_height / patch_size / merge_ratio; + const int grid_window = attn_window_size / patch_size / merge_ratio; + const int ipw = image_size_width / patch_size; + const int iph = image_size_height / patch_size; + /* + pw * ph = number of tokens output by ViT after apply patch merger + ipw * ipw = number of vision token been processed inside ViT + */ + + std::vector idx(ph * pw); + std::vector inv_idx(ph * pw); + int dst = 0; + // [num_vision_tokens, num_vision_tokens] attention mask tensor + std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); + int mask_row = 0; + + for (int y = 0; y < ph; y+=grid_window) + { + for (int x = 0; x < pw; x+=grid_window) + { + const int win_h = std::min(grid_window, ph - y); + const int win_w = std::min(grid_window, pw - x); + const int dst_0 = dst; + // group all tokens belong to the same window togather (to a continue range) + for (int dy = 0; dy < win_h; dy++) { + for (int dx = 0; dx < win_w; dx++) { + const int src = (y + dy) * pw + (x + dx); + assert(src < (int)idx.size()); + assert(dst < (int)inv_idx.size()); + idx[src] = dst; + inv_idx[dst] = src; + dst++; + } + } + + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { + int row_offset = mask_row * (ipw * iph); + std::fill( + mask.begin() + row_offset + 
(dst_0 * merge_ratio * merge_ratio), + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), + 0.0); + mask_row++; + } + } + } + + ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx)); + ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx)); + ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask)); + } + ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); auto status = ggml_backend_sched_graph_compute(ctx->sched.get(), gf); @@ -3142,6 +3513,7 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { case PROJECTOR_TYPE_GLM_EDGE: return ctx->vision_model.mm_model_mlp_3_w->ne[1]; case PROJECTOR_TYPE_QWEN2VL: + case PROJECTOR_TYPE_QWEN25VL: return ctx->vision_model.mm_1_b->ne[0]; case PROJECTOR_TYPE_GEMMA3: return ctx->vision_model.mm_input_proj_w->ne[0]; diff --git a/examples/llava/qwen2_vl_surgery.py b/examples/llava/qwen2_vl_surgery.py index c87606b4fdf4f..7951a6fa8951e 100644 --- a/examples/llava/qwen2_vl_surgery.py +++ b/examples/llava/qwen2_vl_surgery.py @@ -1,14 +1,16 @@ import argparse -from typing import Dict +from typing import Dict, List, Optional import torch import numpy as np from gguf import * from transformers import ( - Qwen2VLForConditionalGeneration, - Qwen2VLProcessor, AutoProcessor, - Qwen2VLConfig + Qwen2VLConfig, + Qwen2VLProcessor, + Qwen2VLForConditionalGeneration, + Qwen2_5_VLConfig, # type: ignore[reportAttributeAccessIssue] + Qwen2_5_VLForConditionalGeneration, # type: ignore[reportAttributeAccessIssue] ) @@ -19,61 +21,93 @@ def k(raw_key: str, arch: str) -> str: return raw_key.format(arch=arch) -def to_gguf_name(name: str) -> str: - og = name - name = name.replace("text_model", "t").replace("vision_model", "v") - name = name.replace("blocks", "blk").replace("embeddings.", "") - name = name.replace("attn.", "attn_") - name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.") - # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln") - name = name.replace("norm1", "ln1").replace("norm2", "ln2") - name = name.replace("merger.mlp", 'mm') - print(f"[to_gguf_name] {og} --> {name}") - return name - - -def find_vision_tensors(qwen2vl, dtype) -> Dict[str, np.ndarray]: - vision_model = qwen2vl.visual - tensor_map = {} - for name, ten in vision_model.state_dict().items(): - ten = ten.numpy() - if 'qkv' in name: - if ten.ndim == 2: # weight - c3, _ = ten.shape - else: # bias - c3 = ten.shape[0] - assert c3 % 3 == 0 - c = c3 // 3 - wq = ten[:c] - wk = ten[c: c * 2] - wv = ten[c * 2:] - tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq - tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk - tensor_map[to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv - elif 'merger' in name: - if name.endswith("ln_q.weight"): - tensor_map['v.post_ln.weight'] = ten - elif name.endswith("ln_q.bias"): - tensor_map['v.post_ln.bias'] = ten +def get_n_wa_pattern(fullatt_block_indexes: Optional[List[int]]): + if fullatt_block_indexes is None: + return 0 + n_wa = fullatt_block_indexes[0] + for a, b in zip(fullatt_block_indexes, fullatt_block_indexes[1:]): + if b - a - 1 != n_wa: + raise ValueError( + f"window/full attention layer should have fix pattern of " + f"for each full-attention layer followed by {n_wa} window-attention layers" + ) + return n_wa + 1 + + +class VL2: + + @staticmethod + def to_gguf_name(name: str) -> str: + og = name + name = 
name.replace("text_model", "t").replace("vision_model", "v") + name = name.replace("blocks", "blk").replace("embeddings.", "") + name = name.replace("attn.", "attn_") + name = name.replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("proj.", "out.") + # name = name.replace("layrnorm", "ln").replace("layer_norm", "ln").replace("layernorm", "ln") + name = name.replace("norm1", "ln1").replace("norm2", "ln2") + name = name.replace("merger.mlp", 'mm') + print(f"[to_gguf_name] {og} --> {name}") + return name + + @classmethod + def find_vision_tensors(cls, qwen2vl, dtype) -> Dict[str, np.ndarray]: + vision_model = qwen2vl.visual + tensor_map = {} + for name, ten in vision_model.state_dict().items(): + ten = ten.numpy() + if 'qkv' in name: + if ten.ndim == 2: # weight + c3, _ = ten.shape + else: # bias + c3 = ten.shape[0] + assert c3 % 3 == 0 + c = c3 // 3 + wq = ten[:c] + wk = ten[c: c * 2] + wv = ten[c * 2:] + tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "q")] = wq + tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "k")] = wk + tensor_map[cls.to_gguf_name(f"vision_model.{name}").replace("qkv", "v")] = wv + elif 'merger' in name: + if name.endswith("ln_q.weight"): + tensor_map['v.post_ln.weight'] = ten + elif name.endswith("ln_q.bias"): + tensor_map['v.post_ln.bias'] = ten + else: + # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias" + tensor_map[cls.to_gguf_name(name)] = ten + elif 'patch_embed.proj.weight' in name: + # NOTE: split Conv3D into Conv2Ds + c1, c2, kt, kh, kw = ten.shape + assert kt == 2, "Current implmentation only support temporal_patch_size of 2" + tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...] + tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...] else: - # "merger.mlp.%d.weight/bias" --> "mm.%d.weight/bias" - tensor_map[to_gguf_name(name)] = ten - elif 'patch_embed.proj.weight' in name: - # NOTE: split Conv3D into Conv2Ds - c1, c2, kt, kh, kw = ten.shape - assert kt == 2, "Current implmentation only support temporal_patch_size of 2" - tensor_map["v.patch_embd.weight"] = ten[:, :, 0, ...] - tensor_map["v.patch_embd.weight.1"] = ten[:, :, 1, ...] 
- else: - tensor_map[to_gguf_name(f"vision_model.{name}")] = ten - - for new_name, ten in tensor_map.items(): - if ten.ndim <= 1 or new_name.endswith("_norm.weight"): - tensor_map[new_name] = ten.astype(np.float32) - else: - tensor_map[new_name] = ten.astype(dtype) - tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder - return tensor_map + tensor_map[cls.to_gguf_name(f"vision_model.{name}")] = ten + + for new_name, ten in tensor_map.items(): + if ten.ndim <= 1 or new_name.endswith("_norm.weight"): + tensor_map[new_name] = ten.astype(np.float32) + else: + tensor_map[new_name] = ten.astype(dtype) + tensor_map["v.position_embd.weight"] = np.zeros([10, 10], dtype=np.float32) # dummy tensor, just here as a placeholder + return tensor_map + + +class VL25(VL2): + + @staticmethod + def to_gguf_name(name: str) -> str: + og = name + name = name.replace("text_model", "t").replace("vision_model", "v") + name = name.replace("blocks", "blk").replace("embeddings.", "") + name = name.replace("attn.", "attn_") + name = name.replace("mlp.down_proj", "ffn_down").replace("mlp.up_proj", "ffn_up") + name = name.replace("mlp.gate_proj", "ffn_gate").replace("proj.", "out.") + name = name.replace("norm1", "ln1").replace("norm2", "ln2") + name = name.replace("merger.mlp", 'mm') + print(f"[vl25][to_gguf_name] {og} --> {name}") + return name def main(args): @@ -82,7 +116,7 @@ def main(args): np_dtype = np.float32 ftype = 0 elif args.data_type == 'fp16': - dtype = torch.float32 + dtype = torch.float16 np_dtype = np.float16 ftype = 1 else: @@ -92,11 +126,18 @@ def main(args): model_path = "" model_name = args.model_name print("model_name: ", model_name) - qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained( - model_name, torch_dtype=dtype, device_map="cpu" - ) - cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType] - vcfg = cfg.vision_config + if args.model_type == "qwen2vl": + qwen2vl = Qwen2VLForConditionalGeneration.from_pretrained( + model_name, torch_dtype=dtype, device_map="cpu" + ) + cfg: Qwen2VLConfig = qwen2vl.config # type: ignore[reportAssignmentType] + vcfg = cfg.vision_config + else: + qwen2vl = Qwen2_5_VLForConditionalGeneration.from_pretrained( + model_name, torch_dtype=dtype, device_map="cpu" + ) + cfg: Qwen2_5_VLConfig = qwen2vl.config # type: ignore[reportAssignmentType] + vcfg = cfg.vision_config if os.path.isdir(model_name): local_model = True @@ -113,7 +154,6 @@ def main(args): fout.add_bool("clip.has_text_encoder", False) fout.add_bool("clip.has_vision_encoder", True) fout.add_bool("clip.has_qwen2vl_merger", True) - fout.add_string("clip.projector_type", "qwen2vl_merger") print(cfg.vision_config) if 'silu' in cfg.vision_config.hidden_act.lower(): @@ -125,14 +165,25 @@ def main(args): else: raise ValueError() - tensor_map = find_vision_tensors(qwen2vl, np_dtype) + if args.model_type == "qwen2.5vl": + fout.add_uint32("clip.vision.n_wa_pattern", get_n_wa_pattern(vcfg.fullatt_block_indexes)) + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.hidden_size) + fout.add_uint32("clip.vision.projection_dim", vcfg.out_hidden_size) + fout.add_string("clip.projector_type", "qwen2.5vl_merger") + else: + fout.add_string("clip.projector_type", "qwen2vl_merger") + fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim) + fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size) + + if args.model_type == "qwen2.5vl": + tensor_map = VL25.find_vision_tensors(qwen2vl, np_dtype) + else: + tensor_map = 
VL2.find_vision_tensors(qwen2vl, np_dtype) for name, data in tensor_map.items(): fout.add_tensor(name, data) fout.add_uint32("clip.vision.patch_size", vcfg.patch_size) fout.add_uint32("clip.vision.image_size", 14 * 40) # some reasonable size that is divable by (14*2) - fout.add_uint32(k(KEY_EMBEDDING_LENGTH, VISION), vcfg.embed_dim) - fout.add_uint32("clip.vision.projection_dim", vcfg.hidden_size) fout.add_uint32(k(KEY_ATTENTION_HEAD_COUNT, VISION), vcfg.num_heads) fout.add_float32(k(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6) fout.add_uint32(k(KEY_BLOCK_COUNT, VISION), vcfg.depth) @@ -160,6 +211,7 @@ def main(args): if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("model_name", nargs='?', default="Qwen/Qwen2-VL-2B-Instruct") + parser.add_argument("--model_type", nargs='?', choices=['qwen2vl', 'qwen2.5vl'], default="qwen2vl") parser.add_argument("--data_type", nargs='?', choices=['fp32', 'fp16'], default="fp32") args = parser.parse_args() main(args) diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp index eca7b7f10b9e3..cf42710869191 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-cli.cpp @@ -23,6 +23,9 @@ #include #include #include +#include +#include +#include static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, @@ -367,14 +370,14 @@ static void debug_test_mrope_2d() { // 1. Initialize backend ggml_backend_t backend = NULL; std::string backend_name = ""; -#ifdef GGML_USE_CUDA - fprintf(stderr, "%s: using CUDA backend\n", __func__); - backend = ggml_backend_cuda_init(0); // init device 0 - backend_name = "cuda"; - if (!backend) { - fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); - } -#endif +// #ifdef GGML_USE_CUDA +// fprintf(stderr, "%s: using CUDA backend\n", __func__); +// backend = ggml_backend_cuda_init(0); // init device 0 +// backend_name = "cuda"; +// if (!backend) { +// fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); +// } +// #endif // if there aren't GPU Backends fallback to CPU backend if (!backend) { backend = ggml_backend_cpu_init(); @@ -483,28 +486,82 @@ static void debug_test_mrope_2d() { ggml_backend_free(backend); } -static void debug_dump_img_embed(struct llava_context * ctx_llava) { - int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama)); - int ne = n_embd * 4; - float vals[56 * 56 * 3]; +enum model_output_type { + conv3d, + patch_embed, + patch_win_attn_scatter, + first_attn_layer, + last_attn_layer, + attn_softmax, + final_layer, +}; + +static void debug_dump_img_embed(struct llava_context * ctx_llava, model_output_type output_type) { + constexpr int ih = 140; + constexpr int iw = 196; + // constexpr int ih = 56; + // constexpr int iw = 56; + // int n_embd = llama_model_n_embd(llama_get_model(ctx_llava->ctx_llama)); + int n_embd = 1280; + int merge = 1; + if (output_type == model_output_type::final_layer) { + n_embd = 2048; + merge = 2; + } + else if (output_type == model_output_type::attn_softmax) { + merge = 1; + n_embd = (ih/14/merge) * (iw/14/merge) * 16; + } + + int ne = (ih/14/merge) * (iw/14/merge) * n_embd; + float vals[iw * ih * 3]; // float embd[ne]; std::vector embd; embd.resize(ne); - for (int i = 0; i < 56*56; i++) + for (int i = 0; i < iw*ih; i++) { for (int c = 0; c < 3; c++) - vals[i * 3 + c] = (float)(i % (56 * 56)) / (56*56); + vals[i * 3 + c] = (float)i / (iw*ih); } - clip_encode_float_image(ctx_llava->ctx_clip, 16, vals, 56, 56, embd.data()); + 
clip_encode_float_image(ctx_llava->ctx_clip, 8, vals, ih, iw, embd.data()); + + std::string file_postfix = ""; + switch (output_type) + { + case model_output_type::conv3d: + file_postfix = "conv3d"; + break; + case model_output_type::patch_embed: + file_postfix = "patch_embed"; + break; + case model_output_type::patch_win_attn_scatter: + file_postfix = "scatter"; + break; + case model_output_type::first_attn_layer: + file_postfix = "first_attn"; + break; + case model_output_type::last_attn_layer: + file_postfix = "last_attn"; + break; + case model_output_type::attn_softmax: + file_postfix = "attn_softmax"; + break; + case model_output_type::final_layer: + file_postfix = "final"; + break; + default: + break; + } + auto output_path = "img_embed_" + file_postfix + ".bin"; - std::ofstream outFile("img_embed.bin", std::ios::binary); + std::ofstream outFile(output_path, std::ios::binary); if (outFile.is_open()) { outFile.write(reinterpret_cast(embd.data()), ne * sizeof(float)); outFile.close(); - std::cout << "Data successfully written to mrope.bin" << std::endl; + std::cout << "Data successfully written to ::[ " << output_path << std::endl; } else { std::cerr << "Error opening file!" << std::endl; } @@ -551,8 +608,9 @@ int main(int argc, char ** argv) { } else if (params.image[0].empty()) { auto ctx_llava = llava_init_context(¶ms, model); - debug_test_mrope_2d(); - debug_dump_img_embed(ctx_llava); + // debug_test_mrope_2d(); + debug_dump_img_embed(ctx_llava, model_output_type::final_layer); + // debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer); llama_perf_context_print(ctx_llava->ctx_llama); ctx_llava->model = NULL; diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh index e612857edc55d..4002f9d531bd2 100755 --- a/examples/llava/tests.sh +++ b/examples/llava/tests.sh @@ -55,6 +55,7 @@ add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # mode add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" +add_test "llama-qwen2vl-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M" # to test the big models, run: ./tests.sh big add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M" From 59e991c23cc44cb5fb657e7b9358cac21fb79828 Mon Sep 17 00:00:00 2001 From: LostRuins Concedo <39025047+LostRuins@users.noreply.github.com> Date: Sun, 27 Apr 2025 18:43:37 +0800 Subject: [PATCH 039/200] Fixes Qwen2.5VL segfault during inference with https://github.com/ggml-org/llama.cpp/pull/12402 as has_qwen2vl_merger migration was incomplete (#13133) --- examples/llava/clip.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index b6a1f40e8a580..3cd27d5b17a08 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1718,7 +1718,8 @@ struct clip_model_loader { if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE - || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL) { + || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL + || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) { n_layer += 1; } @@ -2744,7 +2745,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str } return true; } - else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { + else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { clip_image_u8 resized; auto patch_size = 
clip_get_patch_size(ctx) * 2; int nx = ceil((float)img->nx / patch_size) * patch_size; @@ -3139,7 +3140,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima else { // non-minicpmv models - if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) { + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { // pw * ph = number of tokens output by ViT after apply patch merger // ipw * ipw = number of vision token been processed inside ViT const int merge_ratio = 2; @@ -3279,7 +3280,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } - if (use_window_attn && ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + if (use_window_attn && (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL)) { struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx"); struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx"); struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask"); From e291450b7602d7a36239e4ceeece37625f838373 Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Sun, 27 Apr 2025 19:22:49 +0800 Subject: [PATCH 040/200] musa: fix build warning (#13129) Signed-off-by: Xiaodong Ye --- ggml/src/ggml-cuda/cpy.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu index ed25646e8e261..2d46176eab344 100644 --- a/ggml/src/ggml-cuda/cpy.cu +++ b/ggml/src/ggml-cuda/cpy.cu @@ -639,6 +639,8 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) { ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index; } +#else + GGML_UNUSED(disable_indirection_for_this_node); #endif } From ced44be34290fab450f8344efa047d8a08e723b4 Mon Sep 17 00:00:00 2001 From: matteo Date: Sun, 27 Apr 2025 21:57:32 +0200 Subject: [PATCH 041/200] llama-chat : fix wrong template in GLM4-0414 (#13140) * fix wrong template in GLM4-0414 * fix spaces * no bos token since it is already in the template * moved the chatgml4 check to higher priority * restored template for old GLM models * moved the GLM4 template check in the correct place with correct check --- convert_hf_to_gguf.py | 2 +- src/llama-chat.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ea3a951b93753..d4fec408dd202 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -5154,7 +5154,7 @@ def set_vocab(self): special_vocab._set_special_token("eos", tokenizer.get_added_vocab()["<|endoftext|>"]) special_vocab._set_special_token("eot", tokenizer.get_added_vocab()["<|user|>"]) special_vocab._set_special_token("unk", tokenizer.get_added_vocab()["<|endoftext|>"]) - special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["[gMASK]"]) + special_vocab._set_special_token("bos", tokenizer.get_added_vocab()["<|endoftext|>"]) special_vocab.add_to_gguf(self.gguf_writer) def set_gguf_parameters(self): diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 41f89e3a9d3bd..698c30ce49710 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -122,6 +122,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { return LLM_CHAT_TEMPLATE_PHI_3; + } else if (tmpl_contains("[gMASK]")) { + return LLM_CHAT_TEMPLATE_CHATGML_4; } else if (tmpl_contains("<|assistant|>") && 
tmpl_contains("<|user|>")) { return tmpl_contains("") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { @@ -155,8 +157,6 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } else if (tmpl_contains("[gMASK]sop")) { // chatglm3-6b return LLM_CHAT_TEMPLATE_CHATGML_3; - } else if (tmpl_contains("[gMASK]")) { - return LLM_CHAT_TEMPLATE_CHATGML_4; } else if (tmpl_contains(LU8("<用户>"))) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF return LLM_CHAT_TEMPLATE_MINICPM; From c0a97b762e5ec767dc414f0dc4979befd4c09a52 Mon Sep 17 00:00:00 2001 From: 4onen <11580688+4onen@users.noreply.github.com> Date: Sun, 27 Apr 2025 14:48:26 -0700 Subject: [PATCH 042/200] llama-bench : Add `--override-tensors` arg (#12922) * Add --override-tensors option to llama-bench * Correct llama-bench --override-tensors to --override-tensor * llama-bench: Update --override-tensors parsing to match --tensor-split, appear in test matrix. * Make new llama-bench util functions static to fix Ubuntu CI * llama-bench: Correct -ot corner cases (No -ot calls, leading and trailing empty -ot spans, etc.) --- examples/llama-bench/llama-bench.cpp | 177 ++++++++++++++++++++++++++- 1 file changed, 173 insertions(+), 4 deletions(-) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index cbcbfcee861ee..564a51bfd7b6c 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -36,6 +36,46 @@ static uint64_t get_time_ns() { return std::chrono::nanoseconds(clock::now().time_since_epoch()).count(); } +static bool tensor_buft_override_equal(const llama_model_tensor_buft_override& a, const llama_model_tensor_buft_override& b) { + if (a.pattern != b.pattern) { + // cString comparison that may be null + if (a.pattern == nullptr || b.pattern == nullptr) { + return false; + } + if (strcmp(a.pattern, b.pattern) != 0) { + return false; + } + } + if (a.buft != b.buft) { + return false; + } + return true; +} + +static bool vec_tensor_buft_override_equal(const std::vector& a, const std::vector& b) { + if (a.size() != b.size()) { + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (!tensor_buft_override_equal(a[i], b[i])) { + return false; + } + } + return true; +} + +static bool vec_vec_tensor_buft_override_equal(const std::vector>& a, const std::vector>& b) { + if (a.size() != b.size()) { + return false; + } + for (size_t i = 0; i < a.size(); i++) { + if (!vec_tensor_buft_override_equal(a[i], b[i])) { + return false; + } + } + return true; +} + template static std::string join(const std::vector & values, const std::string & delim) { std::ostringstream str; for (size_t i = 0; i < values.size(); i++) { @@ -175,6 +215,7 @@ struct cmd_params { std::vector no_kv_offload; std::vector flash_attn; std::vector> tensor_split; + std::vector> tensor_buft_overrides; std::vector use_mmap; std::vector embeddings; ggml_numa_strategy numa; @@ -207,6 +248,7 @@ static const cmd_params cmd_params_defaults = { /* no_kv_offload */ { false }, /* flash_attn */ { false }, /* tensor_split */ { std::vector(llama_max_devices(), 0.0f) }, + /* tensor_buft_overrides*/ { std::vector{{nullptr,nullptr}} }, /* use_mmap */ { true }, /* embeddings */ { false }, /* numa */ GGML_NUMA_STRATEGY_DISABLED, @@ -265,6 +307,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str()); printf(" -ts, 
--tensor-split (default: 0)\n"); + printf(" -ot --override-tensors =;... (default: disabled)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay); @@ -557,6 +600,87 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } params.tensor_split.push_back(tensor_split); } + } else if (arg == "-ot" || arg == "--override-tensor") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto value = argv[i]; + /* static */ std::map buft_list; + if (buft_list.empty()) { + // enumerate all the devices and add their buffer types to the list + for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { + auto * dev = ggml_backend_dev_get(i); + auto * buft = ggml_backend_dev_buffer_type(dev); + if (buft) { + buft_list[ggml_backend_buft_name(buft)] = buft; + } + } + } + auto override_group_span_len = std::strcspn(value, ","); + bool last_group = false; + do { + if (override_group_span_len == 0) { + // Adds an empty override-tensors for an empty span + params.tensor_buft_overrides.push_back({{}}); + if (value[override_group_span_len] == '\0') { + value = &value[override_group_span_len]; + last_group = true; + } else { + value = &value[override_group_span_len + 1]; + override_group_span_len = std::strcspn(value, ","); + } + continue; + } + // Stamps null terminators into the argv + // value for this option to avoid the + // memory leak present in the implementation + // over in arg.cpp. Acceptable because we + // only parse these args once in this program. + auto override_group = value; + if (value[override_group_span_len] == '\0') { + value = &value[override_group_span_len]; + last_group = true; + } else { + value[override_group_span_len] = '\0'; + value = &value[override_group_span_len + 1]; + } + std::vector group_tensor_buft_overrides{}; + auto override_span_len = std::strcspn(override_group, ";"); + while (override_span_len > 0) { + auto override = override_group; + if (override_group[override_span_len] != '\0') { + override_group[override_span_len] = '\0'; + override_group = &override_group[override_span_len + 1]; + } else { + override_group = &override_group[override_span_len]; + } + auto tensor_name_span_len = std::strcspn(override, "="); + if (tensor_name_span_len >= override_span_len) { + invalid_param = true; + break; + } + override[tensor_name_span_len] = '\0'; + auto tensor_name = override; + auto buffer_type = &override[tensor_name_span_len + 1]; + if (buft_list.find(buffer_type) == buft_list.end()) { + printf("Available buffer types:\n"); + for (const auto & it : buft_list) { + printf(" %s\n", ggml_backend_buft_name(it.second)); + } + invalid_param = true; + break; + } + group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)}); + override_span_len = std::strcspn(override_group, ";"); + } + if (invalid_param) { + break; + } + group_tensor_buft_overrides.push_back({nullptr,nullptr}); + params.tensor_buft_overrides.push_back(group_tensor_buft_overrides); + override_group_span_len = std::strcspn(value, ","); + } while (!last_group); } else if (arg == "-r" || arg == "--repetitions") { if (++i >= argc) { invalid_param = true; @@ -648,6 +772,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; } + if (params.tensor_buft_overrides.empty()) { + params.tensor_buft_overrides = 
cmd_params_defaults.tensor_buft_overrides; + } if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } @@ -689,6 +816,7 @@ struct cmd_params_instance { bool no_kv_offload; bool flash_attn; std::vector tensor_split; + std::vector tensor_buft_overrides; bool use_mmap; bool embeddings; @@ -733,13 +861,20 @@ struct cmd_params_instance { mparams.tensor_split = tensor_split.data(); mparams.use_mmap = use_mmap; + if (tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = nullptr; + } else { + GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + } + return mparams; } bool equal_mparams(const cmd_params_instance & other) const { return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && - tensor_split == other.tensor_split; + tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); } llama_context_params to_llama_cparams() const { @@ -769,6 +904,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) + for (const auto & ot : params.tensor_buft_overrides) for (const auto & mmp : params.use_mmap) for (const auto & embd : params.embeddings) for (const auto & nb : params.n_batch) @@ -804,6 +940,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .no_kv_offload= */ nkvo, /* .flash_attn = */ fa, /* .tensor_split = */ ts, + /* .tensor_buft_overrides = */ ot, /* .use_mmap = */ mmp, /* .embeddings = */ embd, }; @@ -833,6 +970,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .no_kv_offload= */ nkvo, /* .flash_attn = */ fa, /* .tensor_split = */ ts, + /* .tensor_buft_overrides = */ ot, /* .use_mmap = */ mmp, /* .embeddings = */ embd, }; @@ -862,6 +1000,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .no_kv_offload= */ nkvo, /* .flash_attn = */ fa, /* .tensor_split = */ ts, + /* .tensor_buft_overrides = */ ot, /* .use_mmap = */ mmp, /* .embeddings = */ embd, }; @@ -896,6 +1035,7 @@ struct test { bool no_kv_offload; bool flash_attn; std::vector tensor_split; + std::vector tensor_buft_overrides; bool use_mmap; bool embeddings; int n_prompt; @@ -927,6 +1067,7 @@ struct test { no_kv_offload = inst.no_kv_offload; flash_attn = inst.flash_attn; tensor_split = inst.tensor_split; + tensor_buft_overrides = inst.tensor_buft_overrides; use_mmap = inst.use_mmap; embeddings = inst.embeddings; n_prompt = inst.n_prompt; @@ -972,9 +1113,9 @@ struct test { "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", - "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", - "avg_ts", "stddev_ts", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", + "stddev_ns", "avg_ts", "stddev_ts", }; return fields; } @@ -1000,6 +1141,7 @@ struct test { std::vector get_values() 
const { std::string tensor_split_str; + std::string tensor_buft_overrides_str; int max_nonzero = 0; for (size_t i = 0; i < llama_max_devices(); i++) { if (tensor_split[i] > 0) { @@ -1014,6 +1156,26 @@ struct test { tensor_split_str += "/"; } } + if (tensor_buft_overrides.size() == 1) { + // Last element of tensor_buft_overrides is always a null pattern + // so if it is only one element long, it must be a null pattern. + GGML_ASSERT(tensor_buft_overrides[0].pattern == nullptr); + tensor_buft_overrides_str += "none"; + } else { + for (size_t i = 0; i < tensor_buft_overrides.size()-1; i++) { + // Last element of tensor_buft_overrides is always a null pattern + if (tensor_buft_overrides[i].pattern == nullptr) { + tensor_buft_overrides_str += "none"; + } else { + tensor_buft_overrides_str += tensor_buft_overrides[i].pattern; + tensor_buft_overrides_str += "="; + tensor_buft_overrides_str += ggml_backend_buft_name(tensor_buft_overrides[i].buft); + } + if (i + 2 < tensor_buft_overrides.size()) { + tensor_buft_overrides_str += ";"; + } + } + } std::vector values = { build_commit, std::to_string(build_number), cpu_info, @@ -1037,6 +1199,7 @@ struct test { std::to_string(no_kv_offload), std::to_string(flash_attn), tensor_split_str, + tensor_buft_overrides_str, std::to_string(use_mmap), std::to_string(embeddings), std::to_string(n_prompt), @@ -1254,6 +1417,9 @@ struct markdown_printer : public printer { if (field == "tensor_split") { return "ts"; } + if (field == "tensor_buft_overrides") { + return "ot"; + } return field; } @@ -1307,6 +1473,9 @@ struct markdown_printer : public printer { if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) { fields.emplace_back("tensor_split"); } + if (params.tensor_buft_overrides.size() > 1 || !vec_vec_tensor_buft_override_equal(params.tensor_buft_overrides, cmd_params_defaults.tensor_buft_overrides)) { + fields.emplace_back("tensor_buft_overrides"); + } if (params.use_mmap.size() > 1 || params.use_mmap != cmd_params_defaults.use_mmap) { fields.emplace_back("use_mmap"); } From 85f36e5e7173eef7c671c778db44c034e1d0ab19 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 07:16:59 +0200 Subject: [PATCH 043/200] arg : fix unused variable (#13142) --- common/arg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/arg.cpp b/common/arg.cpp index de173159f4a76..2740149212278 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -673,7 +673,7 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s return {}; } -std::pair> common_remote_get_content(const std::string & url, const common_remote_params & params) { +std::pair> common_remote_get_content(const std::string &, const common_remote_params &) { throw std::runtime_error("error: built without CURL, cannot download model from the internet"); } From 69699be48a6b94570773532850667f1591dc5bbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Mon, 28 Apr 2025 09:29:26 +0200 Subject: [PATCH 044/200] CUDA: fix q_nope_absorbed prec for DS 2 Lite f16 (#13137) --- ggml/include/ggml.h | 4 ++-- ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++-- src/llama-model.cpp | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 51aa5b3a0ab44..1b8603e78e553 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -393,8 +393,8 @@ extern "C" { // precision enum ggml_prec { - GGML_PREC_DEFAULT, - GGML_PREC_F32, + GGML_PREC_DEFAULT = 0, // stored as 
ggml_tensor.op_params, 0 by default + GGML_PREC_F32 = 10, }; // model file types diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index e0e0d2137f3be..19b9ce7231aa2 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -1935,8 +1935,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); } else if (!split && use_mul_mat_vec_q) { ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); - } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) - && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) && + dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // general KQ + KQV multi-batch without FlashAttention ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_mul_mat_vec) { diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 6b7bfecf3a1cf..df2791002e9f9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -10149,6 +10149,7 @@ struct llm_build_deepseek2 : public llm_graph_context { // {n_embd_head_qk_nope, kv_lora_rank, n_head} x {n_embd_head_qk_nope, n_tokens, n_head} ggml_tensor * q_nope_absorbed = ggml_mul_mat(ctx0, model.layers[il].wk_b, q_nope); + ggml_mul_mat_set_prec(q_nope_absorbed, GGML_PREC_F32); cb(q_nope_absorbed, "q_nope_absorbed", il); // {kv_lora_rank, n_head, n_tokens} From f0dd6a1926cdb2f4183a937deee40db26ef8f1da Mon Sep 17 00:00:00 2001 From: R0CKSTAR Date: Mon, 28 Apr 2025 15:33:28 +0800 Subject: [PATCH 045/200] musa: fix typo in cc control (#13144) Signed-off-by: Xiaodong Ye --- ggml/src/ggml-cuda/common.cuh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh index 8284a0017d272..2ea014e6476a7 100644 --- a/ggml/src/ggml-cuda/common.cuh +++ b/ggml/src/ggml-cuda/common.cuh @@ -78,13 +78,13 @@ // Moore Threads #define GGML_CUDA_MUSA_ARCH_IS_QY1 (__MUSA_ARCH__ <= 210) -#define GGML_CUDA_CC_QY1 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 -#define GGML_CUDA_CC_QY2 (GGML_MUSA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 -#define GGML_CUDA_CC_NG (GGML_MUSA_CC_OFFSET_MTHREADS + 0x310) // TBD +#define GGML_CUDA_CC_QY1 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x210) // MTT S80, MTT S3000 +#define GGML_CUDA_CC_QY2 (GGML_CUDA_CC_OFFSET_MTHREADS + 0x220) // MTT S4000 +#define GGML_CUDA_CC_NG (GGML_CUDA_CC_OFFSET_MTHREADS + 0x310) // TBD #define GGML_CUDA_CC_IS_MTHREADS(cc) (cc >= GGML_CUDA_CC_OFFSET_MTHREADS && cc < GGML_CUDA_CC_OFFSET_AMD) #define GGML_CUDA_CC_IS_QY1(cc) (cc >= GGML_CUDA_CC_QY1 && cc < GGML_CUDA_CC_QY2) -#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NEXT) +#define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG) #define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG) #ifdef __CUDA_ARCH_LIST__ From e5d6c2554e7597665e26991a93fa2f3d16c79ad5 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 10:11:58 +0200 Subject: [PATCH 046/200] llama-chat : fix typo GML --> GLM (#13143) --- src/llama-chat.cpp | 12 ++++++------ src/llama-chat.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index 
698c30ce49710..af5e2003198d8 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -50,8 +50,8 @@ static const std::map LLM_CHAT_TEMPLATES = { { "deepseek3", LLM_CHAT_TEMPLATE_DEEPSEEK_3 }, { "command-r", LLM_CHAT_TEMPLATE_COMMAND_R }, { "llama3", LLM_CHAT_TEMPLATE_LLAMA_3 }, - { "chatglm3", LLM_CHAT_TEMPLATE_CHATGML_3 }, - { "chatglm4", LLM_CHAT_TEMPLATE_CHATGML_4 }, + { "chatglm3", LLM_CHAT_TEMPLATE_CHATGLM_3 }, + { "chatglm4", LLM_CHAT_TEMPLATE_CHATGLM_4 }, { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE }, { "minicpm", LLM_CHAT_TEMPLATE_MINICPM }, { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 }, @@ -123,7 +123,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) { return LLM_CHAT_TEMPLATE_PHI_3; } else if (tmpl_contains("[gMASK]")) { - return LLM_CHAT_TEMPLATE_CHATGML_4; + return LLM_CHAT_TEMPLATE_CHATGLM_4; } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) { return tmpl_contains("") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE; } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) { @@ -156,7 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) { return LLM_CHAT_TEMPLATE_LLAMA_3; } else if (tmpl_contains("[gMASK]sop")) { // chatglm3-6b - return LLM_CHAT_TEMPLATE_CHATGML_3; + return LLM_CHAT_TEMPLATE_CHATGLM_3; } else if (tmpl_contains(LU8("<用户>"))) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF return LLM_CHAT_TEMPLATE_MINICPM; @@ -437,7 +437,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|start_header_id|>assistant<|end_header_id|>\n\n"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) { // chatglm3-6b ss << "[gMASK]" << "sop"; for (auto message : chat) { @@ -447,7 +447,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) { ss << "[gMASK]" << ""; for (auto message : chat) { std::string role(message->role); diff --git a/src/llama-chat.h b/src/llama-chat.h index dc30df711a96e..3f5843466d044 100644 --- a/src/llama-chat.h +++ b/src/llama-chat.h @@ -29,8 +29,8 @@ enum llm_chat_template { LLM_CHAT_TEMPLATE_DEEPSEEK_3, LLM_CHAT_TEMPLATE_COMMAND_R, LLM_CHAT_TEMPLATE_LLAMA_3, - LLM_CHAT_TEMPLATE_CHATGML_3, - LLM_CHAT_TEMPLATE_CHATGML_4, + LLM_CHAT_TEMPLATE_CHATGLM_3, + LLM_CHAT_TEMPLATE_CHATGLM_4, LLM_CHAT_TEMPLATE_GLMEDGE, LLM_CHAT_TEMPLATE_MINICPM, LLM_CHAT_TEMPLATE_EXAONE_3, From 43f2b07193cbcccd266734320ea9b948f5a01926 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Apr 2025 11:57:19 +0300 Subject: [PATCH 047/200] common : fix noreturn compile warning (#13151) ggml-ci --- common/arg.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index 2740149212278..75e8e0bd51aee 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -673,8 +673,12 @@ static struct common_hf_file_res common_get_hf_file(const std::string &, const s return {}; } -std::pair> common_remote_get_content(const std::string &, const common_remote_params &) { - throw std::runtime_error("error: built without CURL, cannot download model from the internet"); +std::pair> common_remote_get_content(const std::string & url, const common_remote_params &) { + if (!url.empty()) { + throw std::runtime_error("error: built without CURL, cannot download model from the internet"); + } + + return {}; } #endif // 
LLAMA_USE_CURL From d0a417f3c7a5a22ef05b3b76d91dbe1d3362bf0c Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 28 Apr 2025 12:10:18 +0300 Subject: [PATCH 048/200] readme : update hot topics (#13150) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a0e7bd2d213ed..1785493c3e2b0 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,9 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics +- **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9) - A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated -- **How to use [MTLResidencySet](https://developer.apple.com/documentation/metal/mtlresidencyset?language=objc) to keep the GPU memory active?** https://github.com/ggml-org/llama.cpp/pull/11427 -- **VS Code extension for FIM completions:** https://github.com/ggml-org/llama.vscode +- VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim - Introducing GGUF-my-LoRA https://github.com/ggml-org/llama.cpp/discussions/10123 From a4c340f974f9b7ac0c1aae897aabaa54549a97e5 Mon Sep 17 00:00:00 2001 From: Akarshan Biswas Date: Mon, 28 Apr 2025 15:03:25 +0530 Subject: [PATCH 049/200] SYCL: Add all missing unary kernels (#13074) * SYCL: Add all missing unary kernels ggml-ci * decouple kernel launch range from data size using strided loop * use ciel_div helper for num_blocks ggml-ci * clean auto imported header files --- ggml/src/ggml-sycl/common.hpp | 4 + ggml/src/ggml-sycl/element_wise.cpp | 169 ++++++++++++++++++++++++++++ ggml/src/ggml-sycl/element_wise.hpp | 5 + ggml/src/ggml-sycl/ggml-sycl.cpp | 13 +++ 4 files changed, 191 insertions(+) diff --git a/ggml/src/ggml-sycl/common.hpp b/ggml/src/ggml-sycl/common.hpp index 0ab0fb0aa394d..c3d9d186456ac 100644 --- a/ggml/src/ggml-sycl/common.hpp +++ b/ggml/src/ggml-sycl/common.hpp @@ -493,5 +493,9 @@ static __dpct_inline__ Tp* get_pointer(sycl::local_accessor acc) { int64_t downsample_sycl_global_range(int64_t accumulate_block_num, int64_t block_size); +constexpr size_t ceil_div(const size_t m, const size_t n) { + return (m + n - 1) / n; +} + bool gpu_has_xmx(sycl::device &dev); #endif // GGML_SYCL_COMMON_HPP diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index fc25d98ddff1a..dcc6ec809a7d1 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -21,6 +21,27 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne, } } +template +static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { + for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { + dst[i] = x[i] > static_cast(0.f) ? static_cast(1.f) : ((x[i] < static_cast(0.f) ? 
static_cast(-1.f) : static_cast(0.f))); + } +} + +template +static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { + for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { + dst[i] = sycl::fabs(x[i]); + } +} + +template +static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { + for(auto i = item_ct1.get_global_id(2); i < (const size_t)k; i += item_ct1.get_global_range(2)) { + dst[i] = (x[i] > static_cast(0.f)) ? x[i] : sycl::expm1(x[i]); + } +} + template static void gelu(const T * x, T * dst, const int k, const sycl::nd_item<3> &item_ct1) { @@ -335,6 +356,37 @@ static void silu_sycl(const T *x, T *dst, const int k, }); } +template +static void sgn_sycl(const T * x, T * dst, const int k, queue_ptr stream) { + // hard code for now + const int num_blocks = ceil_div(k, 256); + stream->parallel_for( + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range(1, 1, 256)), sycl::range(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { + sgn(x, dst, k, item_ct1); + }); +} + +template +static void abs_sycl(const T * x, T * dst, const int k, queue_ptr stream) { + // hard code for now + const int num_blocks = ceil_div(k, 256); + stream->parallel_for( + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { + abs_op(x, dst, k, item_ct1); + }); +} + + +template +static void elu_sycl(const T * x, T * dst, const int k, queue_ptr stream) { + // hard code for now + const int num_blocks = ceil_div(k, 256); + stream->parallel_for( + sycl::nd_range<3>((sycl::range<3>(1, 1, num_blocks) * sycl::range<3>(1, 1, 256)), sycl::range<3>(1, 1, 256)), [=](sycl::nd_item<3> item_ct1) { + elu_op(x, dst, k, item_ct1); + }); +} + template static void gelu_quick_sycl(const T *x, T *dst, const int k, queue_ptr stream) { @@ -574,6 +626,106 @@ static void clamp_sycl(const T *x, T *dst, const float min, }); } +inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + switch (dst->type) { +#if defined (GGML_SYCL_F16) + case GGML_TYPE_F16: + { + auto data_pts = cast_data(dst); + sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } +#endif + case GGML_TYPE_F32: + { + auto data_pts = cast_data(dst); + sgn_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } + default: + GGML_ABORT("GGML tensor type not supported!\n"); + break; + } +} + +inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + switch (dst->type) { +#if defined 
(GGML_SYCL_F16) + case GGML_TYPE_F16: + { + auto data_pts = cast_data(dst); + abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } +#endif + case GGML_TYPE_F32: + { + auto data_pts = cast_data(dst); + abs_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } + default: + GGML_ABORT("GGML tensor type not supported!\n"); + break; + } +} + + +inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +#if defined (GGML_SYCL_F16) + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); + GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); + +#else + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); +#endif + GGML_ASSERT(dst->src[0]->type == dst->type); + dpct::queue_ptr main_stream = ctx.stream(); + SYCL_CHECK(ggml_sycl_set_device(ctx.device)); + switch (dst->type) { +#if defined (GGML_SYCL_F16) + case GGML_TYPE_F16: + { + auto data_pts = cast_data(dst); + elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } +#endif + case GGML_TYPE_F32: + { + auto data_pts = cast_data(dst); + elu_sycl(data_pts.src, data_pts.dst, ggml_nelements(dst->src[0]), main_stream); + break; + } + default: + GGML_ABORT("GGML tensor type not supported!\n"); + break; + } +} + inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { #if defined (GGML_SYCL_F16) GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16); @@ -1388,3 +1540,20 @@ void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s done\n", __func__); } +void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + ggml_sycl_op_sgn(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + ggml_sycl_op_abs(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s: DST Tensor type: %s\n", __func__, ggml_type_name(dst->type)); + ggml_sycl_op_elu(ctx, dst); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp index e623cb56f7625..f4199d69da694 100644 --- a/ggml/src/ggml-sycl/element_wise.hpp +++ b/ggml/src/ggml-sycl/element_wise.hpp @@ -66,5 +66,10 @@ void ggml_sycl_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst); void ggml_sycl_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst); +void ggml_sycl_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +void ggml_sycl_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst); #endif // GGML_SYCL_ELEMENTWISE_HPP diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 548f2d0a06be0..66b6f2cca4da9 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -38,6 +38,7 @@ #include "ggml-sycl/backend.hpp" #include "ggml-sycl/common.hpp" +#include "ggml-sycl/element_wise.hpp" #include "ggml-sycl/presets.hpp" #include "ggml-sycl/gemm.hpp" #include "ggml-sycl/sycl_hw.hpp" @@ -3355,6 +3356,15 @@ 
static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_UNARY_OP_EXP: ggml_sycl_exp(ctx, dst); break; + case GGML_UNARY_OP_SGN: + ggml_sycl_sgn(ctx, dst); + break; + case GGML_UNARY_OP_ABS: + ggml_sycl_abs(ctx, dst); + break; + case GGML_UNARY_OP_ELU: + ggml_sycl_elu(ctx, dst); + break; default: return false; } @@ -3837,6 +3847,9 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_EXP: + case GGML_UNARY_OP_SGN: + case GGML_UNARY_OP_ABS: + case GGML_UNARY_OP_ELU: #if defined (GGML_SYCL_F16) return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type); #else From 5fa9e63be82225fb3249c76f39ddda3e5bdec0a3 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 12:18:59 +0200 Subject: [PATCH 050/200] clip : refactor set input for cgraph + fix qwen2.5vl input (#13136) * clip : refactor set input for cgraph * more strict assert * minicpmv : use clip_n_mmproj_embd instead of copying the same code everywhere * split qwen2 and qwen2.5 code blocks * minor style fix --- examples/llava/clip.cpp | 473 ++++++++++++++++++---------------------- 1 file changed, 215 insertions(+), 258 deletions(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 3cd27d5b17a08..8c5d56cc17ae9 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -170,8 +170,8 @@ struct clip_hparams { std::vector image_grid_pinpoints; int32_t image_crop_resolution; std::unordered_set vision_feature_layer; - int32_t attn_window_size; - int32_t n_wa_pattern; + int32_t attn_window_size = 0; + int32_t n_wa_pattern = 0; }; struct clip_layer { @@ -325,7 +325,6 @@ struct clip_ctx { float image_std[3]; bool use_gelu = false; bool use_silu = false; - int32_t ftype = 1; gguf_context_ptr ctx_gguf; ggml_context_ptr ctx_data; @@ -776,7 +775,6 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ const int image_size_width = imgs.entries[0]->nx; const int image_size_height = imgs.entries[0]->ny; - const bool use_mrope = ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL; const bool use_window_attn = hparams.n_wa_pattern > 0; const int n_wa_pattern = hparams.n_wa_pattern; @@ -785,10 +783,11 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ const int patches_w = image_size_width / patch_size; const int patches_h = image_size_height / patch_size; const int num_positions = num_patches + (model.class_embedding ? 1 : 0); - const int num_position_ids = use_mrope ? 
num_positions * 4 : num_positions; + const int num_position_ids = num_positions * 4; // m-rope requires 4 dim per position const int hidden_size = hparams.hidden_size; const int n_head = hparams.n_head; const int d_head = hidden_size / n_head; + const int n_layer = hparams.n_layer; const float eps = hparams.eps; int mrope_sections[4] = {d_head/4, d_head/4, d_head/4, d_head/4}; @@ -870,7 +869,7 @@ static ggml_cgraph * clip_image_build_graph_qwen25vl(clip_ctx * ctx, const clip_ } // loop over layers - for (int il = 0; il < ctx->max_feature_layer; il++) { + for (int il = 0; il < n_layer; il++) { struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states // rmsnorm1 @@ -1115,15 +1114,8 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { int pos_w = image_size_width/patch_size; int pos_h = image_size_height/patch_size; - if (ctx->minicpmv_version == 2) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 3) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } - else if (ctx->minicpmv_version == 4) { - pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 3584, pos_w * pos_h, 1); - } + int n_output_dim = clip_n_mmproj_embd(ctx); + pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_output_dim, pos_w * pos_h, 1); ggml_set_name(pos_embed, "pos_embed"); ggml_set_input(pos_embed); } @@ -1461,23 +1453,17 @@ static ggml_cgraph * clip_image_build_graph_legacy(clip_ctx * ctx, const clip_im } { // attention - int hidden_size = 4096; + int hidden_size = clip_n_mmproj_embd(ctx); const int d_head = 128; int n_head = hidden_size/d_head; int num_query = 96; if (ctx->minicpmv_version == 2) { - hidden_size = 4096; - n_head = hidden_size/d_head; num_query = 96; } else if (ctx->minicpmv_version == 3) { - hidden_size = 3584; - n_head = hidden_size/d_head; num_query = 64; } else if (ctx->minicpmv_version == 4) { - hidden_size = 3584; - n_head = hidden_size/d_head; num_query = 64; } @@ -1760,6 +1746,8 @@ struct clip_model_loader { LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str()); LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector); LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); + LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); + LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } @@ -3038,15 +3026,43 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int patch_size = hparams.patch_size; const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size)); const int num_positions = num_patches + (model.class_embedding ? 
1 : 0); - const int pos_w = ctx->load_image_size.width / patch_size; + const int pos_w = ctx->load_image_size.width / patch_size; const int pos_h = ctx->load_image_size.height / patch_size; const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl + auto get_inp_tensor = [&gf](const char * name) { + struct ggml_tensor * inp = ggml_graph_get_tensor(gf, name); + if (inp == nullptr) { + GGML_ABORT("Failed to get tensor %s", name); + } + if (!(inp->flags & GGML_TENSOR_FLAG_INPUT)) { + GGML_ABORT("Tensor %s is not an input tensor", name); + } + return inp; + }; + + auto set_input_f32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_F32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + auto set_input_i32 = [&get_inp_tensor](const char * name, std::vector & values) { + ggml_tensor * cur = get_inp_tensor(name); + GGML_ASSERT(cur->type == GGML_TYPE_I32); + GGML_ASSERT(ggml_nelements(cur) == (int64_t)values.size()); + ggml_backend_tensor_set(cur, values.data(), 0, ggml_nbytes(cur)); + }; + + // set input pixel values { - struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw"); - std::vector inp_data(ggml_nelements(inp_raw)); - float * data = inp_data.data(); + size_t nelem = 0; + for (const auto & img : imgs.entries) { + nelem += img->nx * img->ny * 3; + } + std::vector inp_raw(nelem); // layout of data (note: the channel dim is unrolled to better visualize the layout): // @@ -3065,7 +3081,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima const int n = nx * ny; for (int b = 0; b < batch_size; b++) { - float * batch_entry = data + b * (3*n); + float * batch_entry = inp_raw.data() + b * (3*n); for (int y = 0; y < ny; y++) { for (int x = 0; x < nx; x++) { size_t base_src = 3*(y * nx + x); // idx of the first channel @@ -3077,266 +3093,207 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } } } - ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw)); + set_input_f32("inp_raw", inp_raw); } - if (ctx->proj_type == PROJECTOR_TYPE_MINICPMV) { - { - // inspired from siglip: - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit - // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - std::vector pos_data(ggml_nelements(positions)); - int * data = pos_data.data(); - int bucket_coords_h[1024]; - int bucket_coords_w[1024]; - for (int i = 0; i < pos_h; i++){ - bucket_coords_h[i] = std::floor(70.0*i/pos_h); - } - for (int i = 0; i < pos_w; i++){ - bucket_coords_w[i] = std::floor(70.0*i/pos_w); - } - for (int i = 0, id = 0; i < pos_h; i++){ - for (int j = 0; j < pos_w; j++){ - data[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + // set input per projector + switch (ctx->proj_type) { + case PROJECTOR_TYPE_MINICPMV: + { + // inspired from siglip: + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit + // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316 + std::vector positions(pos_h * pos_w); + int bucket_coords_h[1024]; + int bucket_coords_w[1024]; + for (int i = 0; i < pos_h; i++){ + 
bucket_coords_h[i] = std::floor(70.0*i/pos_h); } - } - ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions)); - } + for (int i = 0; i < pos_w; i++){ + bucket_coords_w[i] = std::floor(70.0*i/pos_w); + } + for (int i = 0, id = 0; i < pos_h; i++){ + for (int j = 0; j < pos_w; j++){ + positions[id++] = bucket_coords_h[i]*70 + bucket_coords_w[j]; + } + } + set_input_i32("positions", positions); - { - // inspired from resampler of Qwen-VL: - // -> https://huggingface.co/Qwen/Qwen-VL/tree/main - // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 - struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed"); - int embed_dim = 4096; - if (ctx->minicpmv_version == 2) { - embed_dim = 4096; - } - else if (ctx->minicpmv_version == 3) { - embed_dim = 3584; - } - else if (ctx->minicpmv_version == 4) { - embed_dim = 3584; - } - else { - GGML_ABORT("Unknown minicpmv version"); - } + // inspired from resampler of Qwen-VL: + // -> https://huggingface.co/Qwen/Qwen-VL/tree/main + // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23 + int embed_dim = clip_n_mmproj_embd(ctx); - // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? - auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); + // TODO @ngxson : this is very inefficient, can we do this using ggml_sin and ggml_cos? + auto pos_embed_t = get_2d_sincos_pos_embed(embed_dim, std::make_pair(pos_w, pos_h)); - std::vector pos_data(ggml_nelements(pos_embed)); - float * data = pos_data.data(); - for(int i = 0; i < pos_w * pos_h; ++i){ - for(int j = 0; j < embed_dim; ++j){ - data[i * embed_dim + j] = pos_embed_t[i][j]; + std::vector pos_embed(embed_dim * pos_w * pos_h); + for(int i = 0; i < pos_w * pos_h; ++i){ + for(int j = 0; j < embed_dim; ++j){ + pos_embed[i * embed_dim + j] = pos_embed_t[i][j]; + } } - } - ggml_backend_tensor_set(pos_embed, data, 0, ggml_nbytes(pos_embed)); - } - } - else { - // non-minicpmv models - - if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { - // pw * ph = number of tokens output by ViT after apply patch merger - // ipw * ipw = number of vision token been processed inside ViT - const int merge_ratio = 2; - const int pw = image_size_width / patch_size / merge_ratio; - const int ph = image_size_height / patch_size / merge_ratio; - const int ipw = image_size_width / patch_size; - const int iph = image_size_height / patch_size; - - std::vector idx (ph * pw); - std::vector inv_idx(ph * pw); - - if (use_window_attn) { - const int attn_window_size = 112; - struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx"); - struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx"); - struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask"); - - const int grid_window = attn_window_size / patch_size / merge_ratio; - int dst = 0; - // [num_vision_tokens, num_vision_tokens] attention mask tensor - std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); - int mask_row = 0; - - for (int y = 0; y < ph; y += grid_window) - { - for (int x = 0; x < pw; x += grid_window) - { - const int win_h = std::min(grid_window, ph - y); - const int win_w = std::min(grid_window, pw - x); - const int dst_0 = dst; - // group all tokens belong to the same window togather (to a continue range) - for (int dy = 0; dy < win_h; dy++) { - for (int dx = 0; dx < win_w; dx++) { - 
const int src = (y + dy) * pw + (x + dx); - assert(src < (int)idx.size()); - assert(dst < (int)inv_idx.size()); - idx [src] = dst; - inv_idx[dst] = src; - dst++; + set_input_f32("pos_embed", pos_embed); + } break; + case PROJECTOR_TYPE_QWEN2VL: + { + const int merge_ratio = 2; + const int pw = image_size_width / patch_size; + const int ph = image_size_height / patch_size; + std::vector positions(num_positions * 4); + int ptr = 0; + for (int y = 0; y < ph; y += merge_ratio) { + for (int x = 0; x < pw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + positions[ ptr] = y + dy; + positions[ num_patches + ptr] = x + dx; + positions[2 * num_patches + ptr] = y + dy; + positions[3 * num_patches + ptr] = x + dx; + ptr++; } } - - for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { - int row_offset = mask_row * (ipw * iph); - std::fill( - mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), - mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), - 0.0); - mask_row++; - } } } - ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx)); - ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx)); - ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask)); - } else { - std::iota(idx.begin(), idx.end(), 0); - std::iota(inv_idx.begin(), inv_idx.end(), 0); - } + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_QWEN25VL: + { + // pw * ph = number of tokens output by ViT after apply patch merger + // ipw * ipw = number of vision token been processed inside ViT + const int merge_ratio = 2; + const int pw = image_size_width / patch_size / merge_ratio; + const int ph = image_size_height / patch_size / merge_ratio; + const int ipw = image_size_width / patch_size; + const int iph = image_size_height / patch_size; + + std::vector idx (ph * pw); + std::vector inv_idx(ph * pw); + + if (use_window_attn) { + const int attn_window_size = 112; + const int grid_window = attn_window_size / patch_size / merge_ratio; + int dst = 0; + // [num_vision_tokens, num_vision_tokens] attention mask tensor + std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); + int mask_row = 0; + + for (int y = 0; y < ph; y += grid_window) { + for (int x = 0; x < pw; x += grid_window) { + const int win_h = std::min(grid_window, ph - y); + const int win_w = std::min(grid_window, pw - x); + const int dst_0 = dst; + // group all tokens belong to the same window togather (to a continue range) + for (int dy = 0; dy < win_h; dy++) { + for (int dx = 0; dx < win_w; dx++) { + const int src = (y + dy) * pw + (x + dx); + GGML_ASSERT(src < (int)idx.size()); + GGML_ASSERT(dst < (int)inv_idx.size()); + idx [src] = dst; + inv_idx[dst] = src; + dst++; + } + } - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - const int mpow = merge_ratio * merge_ratio; - std::vector positions_data(ggml_nelements(positions)); - int * data = positions_data.data(); + for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { + int row_offset = mask_row * (ipw * iph); + std::fill( + mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), + mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), + 0.0); + mask_row++; + } + } + } - int ptr = 0; - for (int y = 0; y < iph; y += merge_ratio) - { - for (int x = 0; x < ipw; x += merge_ratio) - { - for (int dy = 0; dy < 2; dy++) { - for (int dx = 0; dx < 2; dx++) { - auto remap = idx[ptr / mpow]; - remap = 
remap * mpow + (ptr % mpow); - - data[ remap] = y + dy; - data[ num_patches + remap] = x + dx; - data[2 * num_patches + remap] = y + dy; - data[3 * num_patches + remap] = x + dx; - ptr++; + set_input_i32("window_idx", idx); + set_input_i32("inv_window_idx", inv_idx); + set_input_f32("window_mask", mask); + } else { + for (int i = 0; i < ph * pw; i++) { + idx[i] = i; + } + } + + const int mpow = merge_ratio * merge_ratio; + std::vector positions(num_positions * 4); + + int ptr = 0; + for (int y = 0; y < iph; y += merge_ratio) { + for (int x = 0; x < ipw; x += merge_ratio) { + for (int dy = 0; dy < 2; dy++) { + for (int dx = 0; dx < 2; dx++) { + auto remap = idx[ptr / mpow]; + remap = (remap * mpow) + (ptr % mpow); + + positions[ remap] = y + dy; + positions[ num_patches + remap] = x + dx; + positions[2 * num_patches + remap] = y + dy; + positions[3 * num_patches + remap] = x + dx; + ptr++; + } } } } - } - ggml_backend_tensor_set(positions, data, 0, ggml_nbytes(positions)); - } - else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) { - // do nothing - } - else if (ctx->proj_type == PROJECTOR_TYPE_IDEFICS3) { - // do nothing - } - else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) { - // set the 2D positions - int n_patches_per_col = image_size_width / patch_size; - std::vector pos_data(num_positions); - struct ggml_tensor * pos; - // dimension H - pos = ggml_graph_get_tensor(gf, "pos_h"); - for (int i = 0; i < num_positions; i++) { - pos_data[i] = i / n_patches_per_col; - } - ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos)); - // dimension W - pos = ggml_graph_get_tensor(gf, "pos_w"); - for (int i = 0; i < num_positions; i++) { - pos_data[i] = i % n_patches_per_col; - } - ggml_backend_tensor_set(pos, pos_data.data(), 0, ggml_nbytes(pos)); - } - else { + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_PIXTRAL: + { + // set the 2D positions + int n_patches_per_col = image_size_width / patch_size; + std::vector pos_data(num_positions); + // dimension H + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i / n_patches_per_col; + } + set_input_i32("pos_h", pos_data); + // dimension W + for (int i = 0; i < num_positions; i++) { + pos_data[i] = i % n_patches_per_col; + } + set_input_i32("pos_w", pos_data); + } break; + case PROJECTOR_TYPE_GLM_EDGE: + { // llava and other models - struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions"); - - int* positions_data = (int*)malloc(ggml_nbytes(positions)); + std::vector positions(num_positions); for (int i = 0; i < num_positions; i++) { - positions_data[i] = i; + positions[i] = i; } - ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions)); - free(positions_data); + set_input_i32("positions", positions); + } break; + case PROJECTOR_TYPE_MLP: + case PROJECTOR_TYPE_MLP_NORM: + case PROJECTOR_TYPE_LDP: + case PROJECTOR_TYPE_LDPV2: + { + // llava and other models + std::vector positions(num_positions); + for (int i = 0; i < num_positions; i++) { + positions[i] = i; + } + set_input_i32("positions", positions); - if (ctx->proj_type != PROJECTOR_TYPE_GLM_EDGE) { - struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches"); // The patches vector is used to get rows to index into the embeds with; // we should skip dim 0 only if we have CLS to avoid going out of bounds // when retrieving the rows. int patch_offset = model.class_embedding ? 
1 : 0; - int* patches_data = (int*)malloc(ggml_nbytes(patches)); + std::vector patches(num_patches); for (int i = 0; i < num_patches; i++) { - patches_data[i] = i + patch_offset; + patches[i] = i + patch_offset; } - ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches)); - free(patches_data); - } - } - } - - if (use_window_attn && (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL)) { - struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx"); - struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx"); - struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask"); - - const int merge_ratio = 2; - const int attn_window_size = 112; - const int pw = image_size_width / patch_size / merge_ratio; - const int ph = image_size_height / patch_size / merge_ratio; - const int grid_window = attn_window_size / patch_size / merge_ratio; - const int ipw = image_size_width / patch_size; - const int iph = image_size_height / patch_size; - /* - pw * ph = number of tokens output by ViT after apply patch merger - ipw * ipw = number of vision token been processed inside ViT - */ - - std::vector idx(ph * pw); - std::vector inv_idx(ph * pw); - int dst = 0; - // [num_vision_tokens, num_vision_tokens] attention mask tensor - std::vector mask(pow(ipw * iph, 2), std::numeric_limits::lowest()); - int mask_row = 0; - - for (int y = 0; y < ph; y+=grid_window) - { - for (int x = 0; x < pw; x+=grid_window) + set_input_i32("patches", patches); + } break; + case PROJECTOR_TYPE_GEMMA3: + case PROJECTOR_TYPE_IDEFICS3: { - const int win_h = std::min(grid_window, ph - y); - const int win_w = std::min(grid_window, pw - x); - const int dst_0 = dst; - // group all tokens belong to the same window togather (to a continue range) - for (int dy = 0; dy < win_h; dy++) { - for (int dx = 0; dx < win_w; dx++) { - const int src = (y + dy) * pw + (x + dx); - assert(src < (int)idx.size()); - assert(dst < (int)inv_idx.size()); - idx[src] = dst; - inv_idx[dst] = src; - dst++; - } - } - - for (int r=0; r < win_h * win_w * merge_ratio * merge_ratio; r++) { - int row_offset = mask_row * (ipw * iph); - std::fill( - mask.begin() + row_offset + (dst_0 * merge_ratio * merge_ratio), - mask.begin() + row_offset + (dst * merge_ratio * merge_ratio), - 0.0); - mask_row++; - } - } - } - - ggml_backend_tensor_set(window_idx, idx.data(), 0, ggml_nbytes(window_idx)); - ggml_backend_tensor_set(inv_window_idx, inv_idx.data(), 0, ggml_nbytes(inv_window_idx)); - ggml_backend_tensor_set(window_mask, mask.data(), 0, ggml_nbytes(window_mask)); + // do nothing + } break; + default: + GGML_ABORT("Unknown projector type"); } ggml_backend_cpu_set_n_threads(ctx->backend_cpu, n_threads); @@ -3537,7 +3494,7 @@ bool clip_is_glm(const struct clip_ctx * ctx) { } bool clip_is_qwen2vl(const struct clip_ctx * ctx) { - return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL; + return ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL; } bool clip_is_llava(const struct clip_ctx * ctx) { From d2b2031e5f11b826dcc718138642f147a2009665 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 14:20:56 +0200 Subject: [PATCH 051/200] llama : (mrope) allow using normal 1D position for text token (#13138) * llama : (mrope) use normal position for text token * rm n_pos_per_embd from llm_graph_input_attn_temp --- examples/llava/qwen2vl-cli.cpp | 8 -------- src/llama-graph.cpp | 26 +++++++++++++++++++------- src/llama-graph.h | 12 +++++------- 
3 files changed, 24 insertions(+), 22 deletions(-) diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-cli.cpp index cf42710869191..1e54851ea07a0 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-cli.cpp @@ -92,20 +92,12 @@ static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct lla static bool eval_tokens(struct llama_context * ctx_llama, std::vector tokens, int n_batch, int * n_past, int * st_pos_id) { int N = (int) tokens.size(); - std::vector pos; for (int i = 0; i < N; i += n_batch) { int n_eval = (int) tokens.size() - i; if (n_eval > n_batch) { n_eval = n_batch; } auto batch = llama_batch_get_one(&tokens[i], n_eval); - // TODO: add mrope pos ids somewhere else - pos.resize(batch.n_tokens * 4); - std::fill(pos.begin(), pos.end(), 0); - for (int j = 0; j < batch.n_tokens * 3; j ++) { - pos[j] = *st_pos_id + (j % batch.n_tokens); - } - batch.pos = pos.data(); if (llama_decode(ctx_llama, batch)) { LOG_ERR("%s : failed to eval. token %d/%d (batch size %d, n_past %d)\n", __func__, i, N, n_batch, *n_past); diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index b52e3f6203a4b..e6595fb18bc5b 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -55,7 +55,18 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; - ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos)); + if (ubatch->token && n_pos_per_embd > 1) { + // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D + // the other dimensions are all 0, they are unused for text tokens + std::vector pos_data(n_tokens*n_pos_per_embd, 0); + // copy the first dimension + for (int i = 0; i < n_tokens; ++i) { + pos_data[i] = ubatch->pos[i]; + } + ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos)); + } else { + ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos)); + } } } @@ -71,7 +82,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { ) * f_attn_temp_scale + 1.0; } - ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale)); + ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale)); } } @@ -592,7 +603,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) : res (std::make_unique()) { } -int64_t llm_graph_context::n_pos_per_token() const { +int64_t llm_graph_context::n_pos_per_embd() const { return arch == LLM_ARCH_QWEN2VL ? 
4 : 1; } @@ -1018,11 +1029,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const { } ggml_tensor * llm_graph_context::build_inp_pos() const { - auto inp = std::make_unique(n_pos_per_token()); + auto inp = std::make_unique(n_pos_per_embd()); auto & cur = inp->pos; - cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token()); + cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd()); ggml_set_input(cur); res->add_input(std::move(inp)); @@ -1031,11 +1042,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const { } ggml_tensor * llm_graph_context::build_inp_attn_scale() const { - auto inp = std::make_unique(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); + auto inp = std::make_unique(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale); auto & cur = inp->attn_scale; - cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token()); + // this need to be 1x1xN for broadcasting + cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens); ggml_set_input(cur); res->add_input(std::move(inp)); diff --git a/src/llama-graph.h b/src/llama-graph.h index d192dc1495787..d0c8d32192784 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -90,29 +90,27 @@ class llm_graph_input_embd : public llm_graph_input_i { class llm_graph_input_pos : public llm_graph_input_i { public: - llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {} + llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {} virtual ~llm_graph_input_pos() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * pos = nullptr; // I32 [n_batch] - const int64_t n_pos_per_token = 1; + const int64_t n_pos_per_embd = 1; }; // temperature tuning, used by llama4 class llm_graph_input_attn_temp : public llm_graph_input_i { public: - llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) - : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} + llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) + : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {} virtual ~llm_graph_input_attn_temp() = default; void set_input(const llama_ubatch * ubatch) override; ggml_tensor * attn_scale = nullptr; // F32 [n_batch] - const int64_t n_pos_per_token = 1; - const uint32_t n_attn_temp_floor_scale; const float f_attn_temp_scale; }; @@ -419,7 +417,7 @@ struct llm_graph_context { llm_graph_context(const llm_graph_params & params); - int64_t n_pos_per_token() const; + int64_t n_pos_per_embd() const; void cb(ggml_tensor * cur, const char * name, int il) const; From fb0471d1753824e75474c24f82fbdd54c94dceda Mon Sep 17 00:00:00 2001 From: pockers21 <134406831+pockers21@users.noreply.github.com> Date: Mon, 28 Apr 2025 06:45:40 -0700 Subject: [PATCH 052/200] context : do not clear output buffer on reserve (#13152) Co-authored-by: pockers21 --- src/llama-context.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/llama-context.cpp b/src/llama-context.cpp index a52b6850b465d..e49225aa22433 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -1536,8 +1536,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) { // set all ids as invalid (negative) std::fill(output_ids.begin(), output_ids.end(), -1); - ggml_backend_buffer_clear(buf_output.get(), 0); - this->n_outputs = 
0; this->n_outputs_max = n_outputs_max; From 4e87962e34a4b257ec374c4baf6b1568554b81a9 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 16:12:56 +0200 Subject: [PATCH 053/200] mtmd : fix glm-edge redundant token count (#13139) * mtmd : fix glm-edge redundant token count * fix chat template * temporary disable GLMEdge test chat tmpl --- examples/llava/mtmd.cpp | 10 +--------- src/llama-chat.cpp | 10 +--------- tests/test-chat-template.cpp | 17 +++++++++-------- 3 files changed, 11 insertions(+), 26 deletions(-) diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index a994ef0166e6a..f95f0503569f9 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -203,9 +203,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix - // for glm-edge, we don't need to add because the tokens are already in the returned embeddings - - // TODO @ngxson : glm-edge : remove BOI / EOI tokens embeddings, decode them as normal tokens std::vector parts = string_split_str(prompt_modified, ctx->image_marker); output.clear(); @@ -246,7 +243,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, }; for (const auto & part : parts) { - //printf("tokenizing part: %s\n", part.c_str()); + // printf("tokenizing part: %s\n", part.c_str()); bool add_bos = &parts.front() == ∂ auto tokens = mtmd_tokenize_text_internal(vocab, part, text.add_special && add_bos, text.parse_special); if (tokens.empty()) { @@ -338,11 +335,6 @@ int32_t mtmd_tokenize(mtmd_context * ctx, LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny); LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size()); - if (clip_is_glm(ctx->ctx_clip)) { - // glm-edge - image_tokens->nx += 2; // add 2 for the begin_of_image and end_of_image token embeddings - } - mtmd_input_chunk chunk{ MTMD_INPUT_CHUNK_TYPE_IMAGE, {}, diff --git a/src/llama-chat.cpp b/src/llama-chat.cpp index af5e2003198d8..735d2619c928f 100644 --- a/src/llama-chat.cpp +++ b/src/llama-chat.cpp @@ -447,7 +447,7 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) { + } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { ss << "[gMASK]" << ""; for (auto message : chat) { std::string role(message->role); @@ -456,14 +456,6 @@ int32_t llm_chat_apply_template( if (add_ass) { ss << "<|assistant|>"; } - } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) { - for (auto message : chat) { - std::string role(message->role); - ss << "<|" << role << "|>" << "\n" << message->content; - } - if (add_ass) { - ss << "<|assistant|>"; - } } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) { // MiniCPM-3B-OpenHermes-2.5-v2-GGUF for (auto message : chat) { diff --git a/tests/test-chat-template.cpp b/tests/test-chat-template.cpp index be1a640068dc7..85d89843d6d96 100644 --- a/tests/test-chat-template.cpp +++ b/tests/test-chat-template.cpp @@ -187,14 +187,15 @@ int main(void) { /* .bos_token= */ "", /* .eos_token= */ "", }, - { - /* .name= */ "GLMEdge", - /* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>", - /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant 
<|user|>\nAnother question<|assistant|>", - /* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", - /* .bos_token= */ "", - /* .eos_token= */ "", - }, + // TODO @ngxson : GLMEdge produces poor result without `[gMASK]`, so we're temporarily using GLM4 template for it. We should fix this in the future. + // { + // /* .name= */ "GLMEdge", + // /* .template_str= */ "{% for item in messages %}{% if item['role'] == 'system' %}<|system|>\n{{ item['content'] }}{% elif item['role'] == 'user' %}<|user|>\n{{ item['content'] }}{% elif item['role'] == 'assistant' %}<|assistant|>\n{{ item['content'] }}{% endif %}{% endfor %}<|assistant|>", + // /* .expected_output= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", + // /* .expected_output_jinja= */ "<|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>", + // /* .bos_token= */ "", + // /* .eos_token= */ "", + // }, { /* .name= */ "MiniCPM-3B-OpenHermes-2.5-v2-GGUF", /* .template_str= */ U8C("{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + ''}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}"), From 1831f538f720d1d99fba146f24f0a8e970838cc4 Mon Sep 17 00:00:00 2001 From: Vishal Agarwal Date: Mon, 28 Apr 2025 20:20:39 +0530 Subject: [PATCH 054/200] llama-bench: add `-d` depth arg (#13096) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add depth param * update llama-bench README and add depth param * llama-bench: default params for depth arg for faster execution * Update examples/llama-bench/README.md Co-authored-by: Johannes Gäßler * fix buffer print ub * use user provided args * remove extra whitespaces --------- Co-authored-by: Johannes Gäßler --- examples/llama-bench/README.md | 155 +++++++++++++++++---------- examples/llama-bench/llama-bench.cpp | 47 ++++++-- 2 files changed, 137 insertions(+), 65 deletions(-) diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 6bbe4bb75fbf8..1f5e2f66200a6 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -28,6 +28,7 @@ options: -p, --n-prompt (default: 512) -n, --n-gen (default: 128) -pg (default: ) + -d, --n-depth (default: 0) -b, --batch-size (default: 2048) -ub, --ubatch-size (default: 512) -ctk, --cache-type-k (default: f16) @@ -66,6 +67,8 @@ With the exception of `-r`, `-o` and `-v`, all options can be specified multiple Each test is repeated the number of times given by `-r`, and the results are averaged. The results are given in average tokens per second (t/s) and standard deviation. Some output formats (e.g. json) also include the individual results of each repetition. +Using the `-d ` option, each test can be run at a specified context depth, prefilling the KV cache with `` tokens. + For a description of the other options, see the [main example](../main/README.md). 
Note: @@ -148,6 +151,19 @@ $ ./llama-bench -ngl 10,20,30,31,32,33,34,35 | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | pp 512 | 2400.01 ± 7.72 | | llama 7B mostly Q4_0 | 3.56 GiB | 6.74 B | CUDA | 35 | tg 128 | 131.66 ± 0.49 | +### Different prefilled context + +``` +$ ./llama-bench -d 0,512 +``` + +| model | size | params | backend | ngl | test | t/s | +| ------------------------------ | ---------: | ---------: | ---------- | --: | --------------: | -------------------: | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 | 7340.20 ± 23.45 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 | 120.60 ± 0.59 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | pp512 @ d512 | 6425.91 ± 18.88 | +| qwen2 7B Q4_K - Medium | 4.36 GiB | 7.62 B | CUDA | 99 | tg128 @ d512 | 116.71 ± 0.60 | + ## Output formats By default, llama-bench outputs the results in markdown format. The results can be output in other formats by using the `-o` option. @@ -170,9 +186,9 @@ $ ./llama-bench -o csv ``` ```csv -build_commit,build_number,cuda,metal,gpu_blas,blas,cpu_info,gpu_info,model_filename,model_type,model_size,model_n_params,n_batch,n_threads,f16_kv,n_gpu_layers,main_gpu,mul_mat_q,tensor_split,n_prompt,n_gen,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","512","0","2023-09-23T12:09:01Z","212155977","732372","2413.341687","8.305961" -"3469684","1275","1","0","0","1","1","13th Gen Intel(R) Core(TM) i9-13900K","NVIDIA GeForce RTX 3090 Ti","models/7B/ggml-model-q4_0.gguf","llama 7B mostly Q4_0","3825065984","6738415616","512","16","1","99","0","1","0.00","0","128","2023-09-23T12:09:02Z","969320879","2728399","132.052051","0.371342" +build_commit,build_number,cpu_info,gpu_info,backends,model_filename,model_type,model_size,model_n_params,n_batch,n_ubatch,n_threads,cpu_mask,cpu_strict,poll,type_k,type_v,n_gpu_layers,split_mode,main_gpu,no_kv_offload,flash_attn,tensor_split,use_mmap,embeddings,n_prompt,n_gen,n_depth,test_time,avg_ns,stddev_ns,avg_ts,stddev_ts +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","512","0","0","2025-04-24T11:57:09Z","70285660","982040","7285.676949","100.064434" +"8cf427ff","5163","AMD Ryzen 7 7800X3D 8-Core Processor","NVIDIA GeForce RTX 4080","CUDA","models/Qwen2.5-7B-Instruct-Q4_K_M.gguf","qwen2 7B Q4_K - Medium","4677120000","7615616512","2048","512","8","0x0","0","50","f16","f16","99","layer","0","0","0","0.00","1","0","0","128","0","2025-04-24T11:57:10Z","1067431600","3834831","119.915244","0.430617" ``` ### JSON @@ -184,64 +200,78 @@ $ ./llama-bench -o json ```json [ { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + 
"gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 512, "n_gen": 0, - "test_time": "2023-09-23T12:09:57Z", - "avg_ns": 212365953, - "stddev_ns": 985423, - "avg_ts": 2410.974041, - "stddev_ts": 11.163766, - "samples_ns": [ 213837238, 211635853, 212328053, 211329715, 212698907 ], - "samples_ts": [ 2394.34, 2419.25, 2411.36, 2422.75, 2407.16 ] + "n_depth": 0, + "test_time": "2025-04-24T11:58:50Z", + "avg_ns": 72135640, + "stddev_ns": 1453752, + "avg_ts": 7100.002165, + "stddev_ts": 140.341520, + "samples_ns": [ 74601900, 71632900, 71745200, 71952700, 70745500 ], + "samples_ts": [ 6863.1, 7147.55, 7136.37, 7115.79, 7237.21 ] }, { - "build_commit": "3469684", - "build_number": 1275, - "cuda": true, - "metal": false, - "gpu_blas": true, - "blas": true, - "cpu_info": "13th Gen Intel(R) Core(TM) i9-13900K", - "gpu_info": "NVIDIA GeForce RTX 3090 Ti", - "model_filename": "models/7B/ggml-model-q4_0.gguf", - "model_type": "llama 7B mostly Q4_0", - "model_size": 3825065984, - "model_n_params": 6738415616, - "n_batch": 512, - "n_threads": 16, - "f16_kv": true, + "build_commit": "8cf427ff", + "build_number": 5163, + "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", + "gpu_info": "NVIDIA GeForce RTX 4080", + "backends": "CUDA", + "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", + "model_type": "qwen2 7B Q4_K - Medium", + "model_size": 4677120000, + "model_n_params": 7615616512, + "n_batch": 2048, + "n_ubatch": 512, + "n_threads": 8, + "cpu_mask": "0x0", + "cpu_strict": false, + "poll": 50, + "type_k": "f16", + "type_v": "f16", "n_gpu_layers": 99, + "split_mode": "layer", "main_gpu": 0, - "mul_mat_q": true, + "no_kv_offload": false, + "flash_attn": false, "tensor_split": "0.00", + "use_mmap": true, + "embeddings": false, "n_prompt": 0, "n_gen": 128, - "test_time": "2023-09-23T12:09:59Z", - "avg_ns": 977425219, - "stddev_ns": 9268593, - "avg_ts": 130.965708, - "stddev_ts": 1.238924, - "samples_ns": [ 984472709, 974901233, 989474741, 970729355, 967548060 ], - "samples_ts": [ 130.019, 131.295, 129.362, 131.86, 132.293 ] + "n_depth": 0, + "test_time": "2025-04-24T11:58:51Z", + "avg_ns": 1076767880, + "stddev_ns": 9449585, + "avg_ts": 118.881588, + "stddev_ts": 1.041811, + "samples_ns": [ 1075361300, 1065089400, 1071761200, 1081934900, 1089692600 ], + "samples_ts": [ 119.03, 120.178, 119.43, 118.307, 117.464 ] } ] ``` @@ -254,8 +284,8 @@ $ ./llama-bench -o jsonl ``` ```json lines -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly 
Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":512,"n_gen":0,"test_time":"2023-09-23T12:09:57Z","avg_ns":212365953,"stddev_ns":985423,"avg_ts":2410.974041,"stddev_ts":11.163766,"samples_ns":[213837238,211635853,212328053,211329715,212698907],"samples_ts":[2394.34,2419.25,2411.36,2422.75,2407.16]} -{"build_commit":"3469684","build_number":1275,"cuda":true,"metal":false,"gpu_blas":true,"blas":true,"cpu_info":"13th Gen Intel(R) Core(TM) i9-13900K","gpu_info":"NVIDIA GeForce RTX 3090 Ti","model_filename":"models/7B/ggml-model-q4_0.gguf","model_type":"llama 7B mostly Q4_0","model_size":3825065984,"model_n_params":6738415616,"n_batch":512,"n_threads":16,"f16_kv":true,"n_gpu_layers":99,"main_gpu":0,"mul_mat_q":true,"tensor_split":"0.00","n_prompt":0,"n_gen":128,"test_time":"2023-09-23T12:09:59Z","avg_ns":977425219,"stddev_ns":9268593,"avg_ts":130.965708,"stddev_ts":1.238924,"samples_ns":[984472709,974901233,989474741,970729355,967548060],"samples_ts":[130.019,131.295,129.362,131.86,132.293]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 512, "n_gen": 0, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 70497220, "stddev_ns": 883196, "avg_ts": 7263.609157, "stddev_ts": 90.940578, "samples_ns": [ 71551000, 71222800, 70364100, 69439100, 69909100 ],"samples_ts": [ 7155.74, 7188.71, 7276.44, 7373.37, 7323.8 ]} +{"build_commit": "8cf427ff", "build_number": 5163, "cpu_info": "AMD Ryzen 7 7800X3D 8-Core Processor", "gpu_info": "NVIDIA GeForce RTX 4080", "backends": "CUDA", "model_filename": "models/Qwen2.5-7B-Instruct-Q4_K_M.gguf", "model_type": "qwen2 7B Q4_K - Medium", "model_size": 4677120000, "model_n_params": 7615616512, "n_batch": 2048, "n_ubatch": 512, "n_threads": 8, "cpu_mask": "0x0", "cpu_strict": false, "poll": 50, "type_k": "f16", "type_v": "f16", "n_gpu_layers": 99, "split_mode": "layer", "main_gpu": 0, "no_kv_offload": false, "flash_attn": false, "tensor_split": "0.00", "use_mmap": true, "embeddings": false, "n_prompt": 0, "n_gen": 128, "n_depth": 0, "test_time": "2025-04-24T11:59:33Z", "avg_ns": 1068078400, "stddev_ns": 6279455, "avg_ts": 119.844681, "stddev_ts": 0.699739, "samples_ns": [ 1066331700, 1064864900, 1079042600, 1063328400, 1066824400 ],"samples_ts": [ 120.038, 120.203, 118.624, 120.377, 119.982 ]} ``` @@ -271,25 +301,32 @@ $ ./llama-bench -o sql CREATE TABLE IF NOT EXISTS test ( build_commit TEXT, build_number INTEGER, - cuda INTEGER, - metal INTEGER, - gpu_blas INTEGER, - blas INTEGER, cpu_info TEXT, gpu_info TEXT, + backends TEXT, model_filename TEXT, model_type TEXT, model_size INTEGER, model_n_params INTEGER, n_batch INTEGER, + n_ubatch INTEGER, n_threads INTEGER, - f16_kv INTEGER, + cpu_mask TEXT, + cpu_strict INTEGER, + poll INTEGER, + type_k TEXT, + type_v TEXT, n_gpu_layers INTEGER, + split_mode TEXT, main_gpu INTEGER, - mul_mat_q INTEGER, + 
no_kv_offload INTEGER, + flash_attn INTEGER, tensor_split TEXT, + use_mmap INTEGER, + embeddings INTEGER, n_prompt INTEGER, n_gen INTEGER, + n_depth INTEGER, test_time TEXT, avg_ns INTEGER, stddev_ns INTEGER, @@ -297,6 +334,6 @@ CREATE TABLE IF NOT EXISTS test ( stddev_ts REAL ); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '512', '0', '2023-09-23T12:10:30Z', '212693772', '743623', '2407.240204', '8.409634'); -INSERT INTO test (build_commit, build_number, cuda, metal, gpu_blas, blas, cpu_info, gpu_info, model_filename, model_type, model_size, model_n_params, n_batch, n_threads, f16_kv, n_gpu_layers, main_gpu, mul_mat_q, tensor_split, n_prompt, n_gen, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('3469684', '1275', '1', '0', '0', '1', '1', '13th Gen Intel(R) Core(TM) i9-13900K', 'NVIDIA GeForce RTX 3090 Ti', 'models/7B/ggml-model-q4_0.gguf', 'llama 7B mostly Q4_0', '3825065984', '6738415616', '512', '16', '1', '99', '0', '1', '0.00', '0', '128', '2023-09-23T12:10:31Z', '977925003', '4037361', '130.891159', '0.537692'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '512', '0', '0', '2025-04-24T12:00:08Z', '69905000', '519516', '7324.546977', '54.032613'); +INSERT INTO test (build_commit, build_number, cpu_info, gpu_info, backends, model_filename, model_type, model_size, model_n_params, n_batch, n_ubatch, n_threads, cpu_mask, cpu_strict, poll, type_k, type_v, n_gpu_layers, split_mode, main_gpu, no_kv_offload, flash_attn, tensor_split, use_mmap, embeddings, n_prompt, n_gen, n_depth, test_time, avg_ns, stddev_ns, avg_ts, stddev_ts) VALUES ('8cf427ff', '5163', 'AMD Ryzen 7 7800X3D 8-Core Processor', 'NVIDIA GeForce RTX 4080', 'CUDA', 'models/Qwen2.5-7B-Instruct-Q4_K_M.gguf', 'qwen2 7B Q4_K - Medium', '4677120000', '7615616512', '2048', '512', '8', '0x0', '0', '50', 'f16', 'f16', '99', 'layer', '0', '0', '0', '0.00', '1', '0', '0', '128', '0', '2025-04-24T12:00:09Z', '1063608780', '4464130', '120.346696', '0.504647'); ``` diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 564a51bfd7b6c..5a78216e44fa4 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -200,6 +200,7 @@ struct cmd_params { std::vector n_prompt; std::vector n_gen; std::vector> n_pg; + std::vector n_depth; std::vector n_batch; std::vector n_ubatch; std::vector type_k; @@ -233,6 +234,7 @@ static const cmd_params 
cmd_params_defaults = { /* n_prompt */ { 512 }, /* n_gen */ { 128 }, /* n_pg */ {}, + /* n_depth */ { 0 }, /* n_batch */ { 2048 }, /* n_ubatch */ { 512 }, /* type_k */ { GGML_TYPE_F16 }, @@ -272,6 +274,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -pg (default: %s)\n", join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); + printf(" -d, --n-depth (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str()); printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); printf(" -ub, --ubatch-size (default: %s)\n", @@ -409,6 +412,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { break; } params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) }); + } else if (arg == "-d" || arg == "--n-depth") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = string_split(argv[i], split_delim); + params.n_depth.insert(params.n_depth.end(), p.begin(), p.end()); } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) { invalid_param = true; @@ -739,6 +749,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_pg.empty()) { params.n_pg = cmd_params_defaults.n_pg; } + if (params.n_depth.empty()) { + params.n_depth = cmd_params_defaults.n_depth; + } if (params.n_batch.empty()) { params.n_batch = cmd_params_defaults.n_batch; } @@ -801,6 +814,7 @@ struct cmd_params_instance { std::string model; int n_prompt; int n_gen; + int n_depth; int n_batch; int n_ubatch; ggml_type type_k; @@ -880,7 +894,7 @@ struct cmd_params_instance { llama_context_params to_llama_cparams() const { llama_context_params cparams = llama_context_default_params(); - cparams.n_ctx = n_prompt + n_gen; + cparams.n_ctx = n_prompt + n_gen + n_depth; cparams.n_batch = n_batch; cparams.n_ubatch = n_ubatch; cparams.type_k = type_k; @@ -916,6 +930,7 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & nt : params.n_threads) for (const auto & cm : params.cpu_mask) for (const auto & cs : params.cpu_strict) + for (const auto & nd : params.n_depth) for (const auto & pl : params.poll) { for (const auto & n_prompt : params.n_prompt) { if (n_prompt == 0) { @@ -925,6 +940,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .model = */ m, /* .n_prompt = */ n_prompt, /* .n_gen = */ 0, + /* .n_depth = */ nd, /* .n_batch = */ nb, /* .n_ubatch = */ nub, /* .type_k = */ tk, @@ -955,6 +971,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .model = */ m, /* .n_prompt = */ 0, /* .n_gen = */ n_gen, + /* .n_depth = */ nd, /* .n_batch = */ nb, /* .n_ubatch = */ nub, /* .type_k = */ tk, @@ -985,6 +1002,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .model = */ m, /* .n_prompt = */ n_pg.first, /* .n_gen = */ n_pg.second, + /* .n_depth = */ nd, /* .n_batch = */ nb, /* .n_ubatch = */ nub, /* .type_k = */ tk, @@ -1040,6 +1058,7 @@ struct test { bool embeddings; int n_prompt; int n_gen; + int n_depth; std::string test_time; std::vector samples_ns; @@ -1072,6 +1091,7 @@ struct test { embeddings = inst.embeddings; n_prompt = inst.n_prompt; n_gen = inst.n_gen; + n_depth = inst.n_depth; // RFC 3339 date-time format time_t t = time(NULL); std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t)); @@ -1113,9 +1133,11 @@ struct test { "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", "model_type", "model_size", "model_n_params", 
"n_batch", "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", + "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap", + "embeddings", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns", "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns", - "stddev_ns", "avg_ts", "stddev_ts", + "use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time", + "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", }; return fields; } @@ -1125,8 +1147,8 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" || - field == "stddev_ns") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || + field == "avg_ns" || field == "stddev_ns") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1204,6 +1226,7 @@ struct test { std::to_string(embeddings), std::to_string(n_prompt), std::to_string(n_gen), + std::to_string(n_depth), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), @@ -1381,7 +1404,7 @@ struct markdown_printer : public printer { return 4; } if (field == "test") { - return 13; + return 15; } int width = std::max((int) field.length(), 10); @@ -1531,6 +1554,10 @@ struct markdown_printer : public printer { } else { snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen); } + if (t.n_depth > 0) { + int len = strlen(buf); + snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth); + } value = buf; } else if (field == "t/s") { snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts()); @@ -1789,6 +1816,14 @@ int main(int argc, char ** argv) { for (int i = 0; i < params.reps; i++) { llama_kv_self_clear(ctx); + if (t.n_depth > 0) { + if (params.progress) { + fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count, + i + 1, params.reps); + } + test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads); + } + uint64_t t_start = get_time_ns(); if (t.n_prompt > 0) { From 43ddab6eeeaab5a04fe5a364af0bafb0e4d35065 Mon Sep 17 00:00:00 2001 From: Ville Vesilehto Date: Mon, 28 Apr 2025 21:00:20 +0300 Subject: [PATCH 055/200] fix(rpc): Improve input validation and error handling (#13069) * fix(rpc): Improve input validation and error handling The `rpc-server` was vulnerable to Denial of Service attacks via several RPC commands (`SET_TENSOR`, `GRAPH_COMPUTE`, etc.). Malformed messages could trigger failed assertions (e.g., invalid `ggml_type`) or out-of-bounds reads/writes leading to `GGML_ABORT` calls, crashing the server process. This PR introduces robust input validation and replaces `abort()` calls with graceful error handling: - **Type Validation:** `deserialize_tensor` now checks if the `tensor->type` is within the valid `GGML_TYPE_COUNT` range *before* calling `ggml_new_tensor_4d`. Returns `nullptr` on invalid type. - **Bounds Checks:** Replaced `GGML_ABORT` in `set_tensor`, `set_tensor_hash`, and `get_tensor` handlers with error logging and returning `false` when data/offset parameters are out of buffer bounds. 
- **Size Checks:** Added safe arithmetic checks (for overflow) in `graph_compute` when calculating required message sizes based on client-provided `n_nodes` and `n_tensors`. Returns early if the reported sizes conflict with the actual message size or would lead to overflow. - **Error Propagation:** - `create_node` now checks for `nullptr` return values from `deserialize_tensor` and its recursive calls, propagating `nullptr` upwards on failure. Uses `find` instead of `at` for safer map access. - `copy_tensor` now checks for `nullptr` from `deserialize_tensor` and sets the response status to failure if deserialization or bounds checks fail. - `graph_compute` now checks for `nullptr` return from `create_node` and returns failure status correctly. The final return value now reflects the actual computation status. These changes improve the RPC server's resilience against malformed client requests, preventing crashes and ensuring errors are handled more gracefully. Signed-off-by: Ville Vesilehto * refactor(rpc): address pr comments removed comments and unnecessary returns Signed-off-by: Ville Vesilehto * refactor(rpc): ambiguous nullptr from create_node rpc_server::create_node could previously return nullptr if the input ID was 0 (valid) or if an internal error (deserialization, recursion failure) occurred (invalid). This ambiguity made error handling difficult for the caller (`graph_compute`). This commit clarifies the meaning of nullptr: - `graph_compute` now checks if the input 'id' was non-zero when `create_node` returns nullptr, correctly identifying failures versus intentional null links. - `create_node` avoids recursive calls for zero IDs and propagates nullptr unambiguously on failure during recursion. Signed-off-by: Ville Vesilehto * refactor(rpc): initial zero check in create_node The caller (`graph_compute`) already checks `id != 0` when handling a `nullptr` return from `create_node`, correctly distinguishing intentional null links from actual errors. This makes the initial `if (id == 0)` check redundant. Also removes the log message when a tensor ID is not found in the provided map which was added in this branch. Signed-off-by: Ville Vesilehto * fix(rpc): Handle get_alloc_size failure in server Check the return value of `server.get_alloc_size` in the RPC server loop. If the call fails, return early to close the connection. Signed-off-by: Ville Vesilehto * refactor(rpc): input size validation in graph_compute Removes detailed, step-by-step size calculations and overflow checks in favor of simpler direct comparisons, assuming 64-bit overflow is unlikely. Signed-off-by: Ville Vesilehto * refactor(rpc): remove extra status code setting Removes the explicit setting of `response.result = GGML_STATUS_FAILED` when `create_node` returns `nullptr` within `graph_compute`. Primary signal is the `false` return value in case of failure. Signed-off-by: Ville Vesilehto * refactor(rpc): remove redundant check for tensor->type Breaks CI on ubuntu-cpu-make. Tensor type is uint32_t, thus the check is not needed. 
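For reference, a minimal sketch of the bounds-check pattern this patch applies in the set_tensor / set_tensor_hash / get_tensor handlers. This is a standalone illustration only: the helper name and signature below are not part of the diff, which inlines the same comparison at each call site.

    // Sketch only: accept a client-supplied (data, offset, size) region only if it
    // lies entirely inside the destination buffer [p0, p1).
    #include <cstddef>
    #include <cstdint>

    static bool region_in_bounds(uint64_t data, uint64_t offset, uint64_t size,
                                 uint64_t p0, size_t buf_size) {
        const uint64_t p1 = p0 + buf_size;
        if (data + offset < p0 || data + offset >= p1) {
            return false;                      // region starts outside the buffer
        }
        return size <= p1 - (data + offset);   // region must not run past the end
    }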
Signed-off-by: Ville Vesilehto --------- Signed-off-by: Ville Vesilehto --- ggml/src/ggml-rpc/ggml-rpc.cpp | 78 +++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-rpc/ggml-rpc.cpp b/ggml/src/ggml-rpc/ggml-rpc.cpp index 9023eb0919690..140a775f9806f 100644 --- a/ggml/src/ggml-rpc/ggml-rpc.cpp +++ b/ggml/src/ggml-rpc/ggml-rpc.cpp @@ -982,8 +982,21 @@ bool rpc_server::buffer_clear(const rpc_msg_buffer_clear_req & request) { } ggml_tensor * rpc_server::deserialize_tensor(struct ggml_context * ctx, const rpc_tensor * tensor) { + // Validate tensor type before using it + if (tensor->type >= GGML_TYPE_COUNT) { + GGML_LOG_ERROR("[%s] invalid tensor type received: %u\n", __func__, tensor->type); + return nullptr; + } + ggml_tensor * result = ggml_new_tensor_4d(ctx, (ggml_type) tensor->type, tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3]); + + // ggml_new_tensor_4d might fail if dimensions are invalid, although less likely to crash than invalid type + if (result == nullptr) { + GGML_LOG_ERROR("[%s] ggml_new_tensor_4d failed for type %u\\n", __func__, tensor->type); + return nullptr; + } + for (uint32_t i = 0; i < GGML_MAX_DIMS; i++) { result->nb[i] = tensor->nb[i]; } @@ -1043,7 +1056,9 @@ bool rpc_server::set_tensor(const std::vector & input) { const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { - GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu) out of buffer bounds [0x%zx, 0x%zx)\n", + __func__, in_tensor->data, offset, size, p0, p1); + return false; } } @@ -1118,7 +1133,9 @@ bool rpc_server::set_tensor_hash(const std::vector & input, rpc_msg_set const size_t p1 = p0 + ggml_backend_buffer_get_size(tensor->buffer); if (in_tensor->data + offset < p0 || in_tensor->data + offset >= p1 || size > (p1 - in_tensor->data - offset)) { - GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + GGML_LOG_ERROR("[%s] tensor data region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%zu, hash=0x%" PRIx64 ") out of buffer bounds [0x%zx, 0x%zx)\n", + __func__, in_tensor->data, offset, size, *hash, p0, p1); + return false; } } ggml_backend_tensor_set(tensor, cached_file.data(), offset, size); @@ -1183,7 +1200,9 @@ bool rpc_server::get_tensor(const rpc_msg_get_tensor_req & request, std::vector< if (request.tensor.data + request.offset < p0 || request.tensor.data + request.offset >= p1 || request.size > (p1 - request.tensor.data - request.offset)) { - GGML_ABORT("[%s] tensor->data out of bounds\n", __func__); + GGML_LOG_ERROR("[%s] requested tensor region (data=0x%" PRIx64 ", offset=%" PRIu64 ", size=%" PRIu64 ") out of buffer bounds [0x%zx, 0x%zx)\n", + __func__, request.tensor.data, request.offset, request.size, p0, p1); + return false; } } @@ -1237,22 +1256,50 @@ ggml_tensor * rpc_server::create_node(uint64_t id, struct ggml_context * ctx, const std::unordered_map & tensor_ptrs, std::unordered_map & tensor_map) { - if (id == 0) { - return nullptr; - } if (tensor_map.find(id) != tensor_map.end()) { return tensor_map[id]; } - const rpc_tensor * tensor = tensor_ptrs.at(id); + // Safely find the tensor pointer + auto it_ptr = tensor_ptrs.find(id); + if (it_ptr == tensor_ptrs.end()) { + return nullptr; + } + const rpc_tensor * tensor = it_ptr->second; + struct ggml_tensor * result = deserialize_tensor(ctx, tensor); if 
(result == nullptr) { return nullptr; } tensor_map[id] = result; for (int i = 0; i < GGML_MAX_SRC; i++) { - result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + // Check if the source ID is 0 before calling create_node recursively + if (tensor->src[i] == 0) { + result->src[i] = nullptr; + } else { + result->src[i] = create_node(tensor->src[i], ctx, tensor_ptrs, tensor_map); + // If the recursive call failed for a non-zero ID, propagate the error + if (result->src[i] == nullptr) { + GGML_LOG_ERROR("[%s] failed to create source node %d (src_id=%" PRIu64 ") for node id %" PRIu64 "\n", + __func__, i, tensor->src[i], id); + // Must return nullptr to signal failure up the call stack + return nullptr; + } + } + } + + // Handle view_src similarly + if (tensor->view_src == 0) { + result->view_src = nullptr; + } else { + result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); + // If the recursive call failed for a non-zero ID, propagate the error + if (result->view_src == nullptr) { + GGML_LOG_ERROR("[%s] failed to create view_src node (view_src_id=%" PRIu64 ") for node id %" PRIu64 "\n", + __func__, tensor->view_src, id); + // Must return nullptr to signal failure up the call stack + return nullptr; + } } - result->view_src = create_node(tensor->view_src, ctx, tensor_ptrs, tensor_map); result->view_offs = tensor->view_offs; return result; } @@ -1278,6 +1325,7 @@ bool rpc_server::graph_compute(const std::vector & input, rpc_msg_graph GGML_PRINT_DEBUG("[%s] n_nodes: %u, n_tensors: %u\n", __func__, n_nodes, n_tensors); size_t buf_size = ggml_tensor_overhead()*(n_nodes + n_tensors) + ggml_graph_overhead_custom(n_nodes, false); + struct ggml_init_params params = { /*.mem_size =*/ buf_size, /*.mem_buffer =*/ NULL, @@ -1297,6 +1345,14 @@ bool rpc_server::graph_compute(const std::vector & input, rpc_msg_graph int64_t id; memcpy(&id, &nodes[i], sizeof(id)); graph->nodes[i] = create_node(id, ctx, tensor_ptrs, tensor_map); + + // Check if create_node failed for a *non-zero* ID. + // If id was 0, create_node returning nullptr is expected. + // If id was non-zero and create_node returned nullptr, it indicates a deserialization error. 
+ if (graph->nodes[i] == nullptr && id != 0) { + GGML_LOG_ERROR("[%s] failed to create graph node %d (id=%" PRId64 ")\n", __func__, i, id); + return false; + } } ggml_status status = ggml_backend_graph_compute(backend, graph); response.result = status; @@ -1361,7 +1417,9 @@ static void rpc_serve_client(ggml_backend_t backend, const char * cache_dir, return; } rpc_msg_get_alloc_size_rsp response; - server.get_alloc_size(request, response); + if (!server.get_alloc_size(request, response)) { + return; + } if (!send_msg(sockfd, &response, sizeof(response))) { return; } From eaea3253244dc4bbe07f6cd81325847ccc6cf93e Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Mon, 28 Apr 2025 21:23:19 +0200 Subject: [PATCH 056/200] clip : fix model size display (#13153) --- examples/llava/clip.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 8c5d56cc17ae9..a5eb55f4d412d 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -1574,7 +1574,7 @@ struct clip_model_loader { clip_ctx & ctx_clip; std::string fname; - size_t model_size; // in bytes + size_t model_size = 0; // in bytes // TODO @ngxson : we should not pass clip_ctx here, it should be clip_vision_model clip_model_loader(const char * fname, clip_ctx & ctx_clip) : ctx_clip(ctx_clip), fname(fname) { @@ -1748,6 +1748,8 @@ struct clip_model_loader { LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version); LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor); LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern); + LOG_INF("%s: use_silu: %d\n", __func__, ctx_clip.use_silu); + LOG_INF("%s: use_gelu: %d\n", __func__, ctx_clip.use_gelu); LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0); LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0); } From 5f5e39e1ba5dbea814e41f2a15e035d749a520bc Mon Sep 17 00:00:00 2001 From: AT Date: Mon, 28 Apr 2025 15:52:15 -0400 Subject: [PATCH 057/200] model : Nomic Embed Text V2 with Mixture-of-Experts (MoE) architecture (#12466) * Nomic Embed Text V2 with Mixture-of-Experts (MoE) architecture - Adds MoE-based embedding model supporting multilingual embeddings. - Selects architecture variant based on hyperparameter detection (MoE layers). - Removes unnecessary subclass initialization checks for clarity. https://www.nomic.ai/blog/posts/nomic-embed-text-v2 Co-authored-by: Jared Van Bortel * fix tokenizer * don't rename this tensor --------- Co-authored-by: Jared Van Bortel --- convert_hf_to_gguf.py | 227 ++++++++++++++++++++------------- gguf-py/gguf/constants.py | 19 +++ gguf-py/gguf/gguf_writer.py | 3 + gguf-py/gguf/tensor_mapping.py | 4 + src/llama-arch.cpp | 20 +++ src/llama-arch.h | 2 + src/llama-graph.cpp | 25 ++-- src/llama-hparams.h | 1 + src/llama-model.cpp | 56 ++++++-- 9 files changed, 247 insertions(+), 110 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index d4fec408dd202..b9cea7e4699c6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -78,7 +78,7 @@ class ModelBase: # subclasses should define this! 
model_arch: gguf.MODEL_ARCH - def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool = False, + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, *, is_big_endian: bool = False, use_temp_file: bool = False, eager: bool = False, metadata_override: Path | None = None, model_name: str | None = None, split_max_tensors: int = 0, split_max_size: int = 0, dry_run: bool = False, @@ -454,13 +454,6 @@ def from_model_architecture(cls, arch: str, model_type = ModelType.TEXT) -> type class TextModel(ModelBase): - @classmethod - def __init_subclass__(cls): - # can't use an abstract property, because overriding it without type errors - # would require using decorated functions instead of simply defining the property - if "model_arch" not in cls.__dict__: - raise TypeError(f"Missing property 'model_arch' for {cls.__name__!r}") - def set_vocab(self): self._set_vocab_gpt2() @@ -3373,14 +3366,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [(self.map_tensor_name(name), data_torch)] - -@ModelBase.register("RobertaModel") -class RobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - + def _xlmroberta_tokenizer_init(self) -> None: # we need the pad_token_id to know how to chop down position_embd matrix if (pad_token_id := self.hparams.get("pad_token_id")) is not None: self._position_offset = 1 + pad_token_id @@ -3389,82 +3375,7 @@ def __init__(self, *args, **kwargs): else: self._position_offset = None - def set_vocab(self): - """Support BPE tokenizers for roberta models""" - bpe_tok_path = self.dir_model / "tokenizer.json" - if bpe_tok_path.exists(): - self._set_vocab_gpt2() - self.gguf_writer.add_add_bos_token(True) - self.gguf_writer.add_add_eos_token(True) - - # we need this to validate the size of the token_type embeddings - # though currently we are passing all zeros to the token_type embeddings - # "Sequence A" or "Sequence B" - self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) - - else: - return super().set_vocab() - - def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # if name starts with "roberta.", remove the prefix - # e.g. 
https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main - if name.startswith("roberta."): - name = name[8:] - - # position embeddings start at pad_token_id + 1, so just chop down the weight tensor - if name == "embeddings.position_embeddings.weight": - if self._position_offset is not None: - data_torch = data_torch[self._position_offset:,:] - - return super().modify_tensors(data_torch, name, bid) - - -@ModelBase.register("NomicBertModel") -class NomicBertModel(BertModel): - model_arch = gguf.MODEL_ARCH.NOMIC_BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # the HF config claims n_ctx=8192, but it uses RoPE scaling - self.hparams["n_ctx"] = 2048 - - # SwigLU activation - assert self.hparams["activation_function"] == "swiglu" - # this doesn't do anything in the HF version - assert self.hparams["causal"] is False - # no bias tensors - assert self.hparams["qkv_proj_bias"] is False - assert self.hparams["mlp_fc1_bias"] is False - assert self.hparams["mlp_fc2_bias"] is False - # norm at end of layer - assert self.hparams["prenorm"] is False - # standard RoPE - assert self.hparams["rotary_emb_fraction"] == 1.0 - assert self.hparams["rotary_emb_interleaved"] is False - assert self.hparams["rotary_emb_scale_base"] is None - - def set_gguf_parameters(self): - super().set_gguf_parameters() - self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) - - -@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") -class XLMRobertaModel(BertModel): - model_arch = gguf.MODEL_ARCH.BERT - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # we need the pad_token_id to know how to chop down position_embd matrix - if (pad_token_id := self.hparams.get("pad_token_id")) is not None: - self._position_offset = 1 + pad_token_id - if "max_position_embeddings" in self.hparams: - self.hparams["max_position_embeddings"] -= self._position_offset - else: - self._position_offset = None - - def set_vocab(self): + def _xlmroberta_set_vocab(self) -> None: # to avoid TypeError: Descriptors cannot be created directly # exception when importing sentencepiece_model_pb2 os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python" @@ -3546,6 +3457,138 @@ def set_vocab(self): self.gguf_writer.add_add_bos_token(True) self.gguf_writer.add_add_eos_token(True) + +@ModelBase.register("RobertaModel") +class RobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # we need the pad_token_id to know how to chop down position_embd matrix + if (pad_token_id := self.hparams.get("pad_token_id")) is not None: + self._position_offset = 1 + pad_token_id + if "max_position_embeddings" in self.hparams: + self.hparams["max_position_embeddings"] -= self._position_offset + else: + self._position_offset = None + + def set_vocab(self): + """Support BPE tokenizers for roberta models""" + bpe_tok_path = self.dir_model / "tokenizer.json" + if bpe_tok_path.exists(): + self._set_vocab_gpt2() + self.gguf_writer.add_add_bos_token(True) + self.gguf_writer.add_add_eos_token(True) + + # we need this to validate the size of the token_type embeddings + # though currently we are passing all zeros to the token_type embeddings + # "Sequence A" or "Sequence B" + self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1)) + + else: + return super().set_vocab() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + 
# if name starts with "roberta.", remove the prefix + # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main + if name.startswith("roberta."): + name = name[8:] + + # position embeddings start at pad_token_id + 1, so just chop down the weight tensor + if name == "embeddings.position_embeddings.weight": + if self._position_offset is not None: + data_torch = data_torch[self._position_offset:,:] + + return super().modify_tensors(data_torch, name, bid) + + +@ModelBase.register("NomicBertModel") +class NomicBertModel(BertModel): + def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, **kwargs: Any): + hparams = kwargs.pop("hparams", None) + if hparams is None: + hparams = ModelBase.load_hparams(dir_model) + + self.is_moe = bool(hparams.get("moe_every_n_layers")) + self.model_arch = gguf.MODEL_ARCH.NOMIC_BERT_MOE if self.is_moe else gguf.MODEL_ARCH.NOMIC_BERT + + super().__init__(dir_model, ftype, fname_out, hparams=hparams, **kwargs) + + self._tokenizer_is_xlmroberta = self._is_tokenizer_xlmroberta() + if self._tokenizer_is_xlmroberta: + self._xlmroberta_tokenizer_init() + + # the HF config claims n_ctx=8192, but it uses RoPE scaling + self.hparams["n_ctx"] = 2048 + + assert self.hparams["activation_function"] == "gelu" if self.is_moe else "swiglu" + + # this doesn't do anything in the HF version + assert self.hparams["causal"] is False + # no bias tensors unless MoE + assert self.hparams["qkv_proj_bias"] == self.is_moe + assert self.hparams["mlp_fc1_bias"] == self.is_moe + assert self.hparams["mlp_fc2_bias"] == self.is_moe + + # norm at end of layer + assert self.hparams["prenorm"] is False + # standard RoPE + assert self.hparams["rotary_emb_fraction"] == 1.0 + assert self.hparams["rotary_emb_interleaved"] is False + assert self.hparams["rotary_emb_scale_base"] is None + + def set_vocab(self) -> None: + if self._tokenizer_is_xlmroberta: + return self._xlmroberta_set_vocab() + return super().set_vocab() + + def modify_tensors(self, data_torch: torch.Tensor, name: str, bid: int | None) -> Iterable[tuple[str, torch.Tensor]]: + # If the tensor is an experts bias tensor, skip it by returning an empty list. + if "mlp.experts.bias" in name: + return [] # Explicitly return an empty list. 
+ + if "mlp.experts.mlp.w1" in name: + data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"]) + name += ".weight" + + if "mlp.experts.mlp.w2" in name: + data_torch = data_torch.view(self.hparams["num_experts"], self.hparams["n_inner"], self.hparams["n_embd"]) + data_torch = data_torch.transpose(1, 2) + name += ".weight" + + return [(self.map_tensor_name(name), data_torch)] + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"]) + if self.is_moe: + self.gguf_writer.add_moe_every_n_layers(self.hparams["moe_every_n_layers"]) + self.gguf_writer.add_expert_count(self.hparams["num_experts"]) + self.gguf_writer.add_expert_used_count(self.hparams["moe_top_k"]) + + def _is_tokenizer_xlmroberta(self) -> bool: + with open(self.dir_model / "tokenizer.json") as f: + tokenizer_json = json.load(f) + toktyp = tokenizer_json["model"]["type"] + if toktyp == "Unigram": + return True + if toktyp == "WordPiece": + return False + raise ValueError(f"unknown tokenizer: {toktyp}") + + +@ModelBase.register("XLMRobertaModel", "XLMRobertaForSequenceClassification") +class XLMRobertaModel(BertModel): + model_arch = gguf.MODEL_ARCH.BERT + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._xlmroberta_tokenizer_init() + + def set_vocab(self): + self._xlmroberta_set_vocab() + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # if name starts with "roberta.", remove the prefix # e.g. https://huggingface.co/BAAI/bge-reranker-v2-m3/tree/main diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b81017b142583..326ccdb071a79 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -104,6 +104,7 @@ class LLM: EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale" EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm" EXPERT_GATING_FUNC = "{arch}.expert_gating_func" + MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers" POOLING_TYPE = "{arch}.pooling_type" LOGIT_SCALE = "{arch}.logit_scale" DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id" @@ -267,6 +268,7 @@ class MODEL_ARCH(IntEnum): REFACT = auto() BERT = auto() NOMIC_BERT = auto() + NOMIC_BERT_MOE = auto() JINA_BERT_V2 = auto() BLOOM = auto() STABLELM = auto() @@ -521,6 +523,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.REFACT: "refact", MODEL_ARCH.BERT: "bert", MODEL_ARCH.NOMIC_BERT: "nomic-bert", + MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe", MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2", MODEL_ARCH.BLOOM: "bloom", MODEL_ARCH.STABLELM: "stablelm", @@ -960,6 +963,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_UP, MODEL_TENSOR.LAYER_OUT_NORM, ], + MODEL_ARCH.NOMIC_BERT_MOE: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.TOKEN_TYPES, + MODEL_TENSOR.POS_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.ATTN_OUT_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.LAYER_OUT_NORM, + ], MODEL_ARCH.JINA_BERT_V2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 48e9a470b78d6..f22a6d4a3472b 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -728,6 +728,9 @@ def add_expert_weights_norm(self, value: bool) -> None: def add_expert_gating_func(self, 
value: ExpertGatingFuncType) -> None: self.add_uint32(Keys.LLM.EXPERT_GATING_FUNC.format(arch=self.arch), value.value) + def add_moe_every_n_layers(self, value: int) -> None: + self.add_uint32(Keys.LLM.MOE_EVERY_N_LAYERS.format(arch=self.arch), value) + def add_swin_norm(self, value: bool) -> None: self.add_bool(Keys.LLM.SWIN_NORM.format(arch=self.arch), value) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 1d70551973b01..311d1ff69c799 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -290,6 +290,7 @@ class TensorNameMap: "transformer.blocks.{bid}.ffn.router.layer", # dbrx "model.layers.{bid}.block_sparse_moe.router.layer", # granitemoe "language_model.model.layers.{bid}.feed_forward.router", # llama4 + "encoder.layers.{bid}.mlp.router.layer", # nomic-bert-moe ), MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( @@ -322,6 +323,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "encoder.layers.{bid}.mlp.fc1", # nomic-bert-moe "model.layers.{bid}.mlp.c_fc", # starcoder2 "encoder.layer.{bid}.mlp.gated_layers_v", # jina-bert-v2 "model.layers.{bid}.residual_mlp.w3", # arctic @@ -337,6 +339,7 @@ class TensorNameMap: "model.layers.{bid}.mlp.experts.up_proj", # qwen2moe olmoe (merged) "model.layers.{bid}.block_sparse_moe.experts.w3", # phimoe (merged) "language_model.model.layers.{bid}.feed_forward.experts.up_proj", # llama4 + "encoder.layers.{bid}.mlp.experts.mlp.w1", # nomic-bert-moe ), MODEL_TENSOR.FFN_UP_SHEXP: ( @@ -418,6 +421,7 @@ class TensorNameMap: "model.layers.{bid}.block_sparse_moe.output_linear", # granitemoe "model.layers.{bid}.block_sparse_moe.experts.w2", # phimoe (merged) "language_model.model.layers.{bid}.feed_forward.experts.down_proj", # llama4 + "encoder.layers.{bid}.mlp.experts.mlp.w2", # nomic-bert-moe ), MODEL_TENSOR.FFN_DOWN_SHEXP: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 62e1480bb5881..f2bc8ca768502 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -19,6 +19,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_REFACT, "refact" }, { LLM_ARCH_BERT, "bert" }, { LLM_ARCH_NOMIC_BERT, "nomic-bert" }, + { LLM_ARCH_NOMIC_BERT_MOE, "nomic-bert-moe" }, { LLM_ARCH_JINA_BERT_V2, "jina-bert-v2" }, { LLM_ARCH_BLOOM, "bloom" }, { LLM_ARCH_STABLELM, "stablelm" }, @@ -106,6 +107,7 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_EXPERT_WEIGHTS_SCALE, "%s.expert_weights_scale" }, { LLM_KV_EXPERT_WEIGHTS_NORM, "%s.expert_weights_norm" }, { LLM_KV_EXPERT_GATING_FUNC, "%s.expert_gating_func" }, + { LLM_KV_MOE_EVERY_N_LAYERS, "%s.moe_every_n_layers" }, { LLM_KV_POOLING_TYPE, "%s.pooling_type" }, { LLM_KV_LOGIT_SCALE, "%s.logit_scale" }, { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" }, @@ -472,6 +474,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_NOMIC_BERT_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + { LLM_TENSOR_TOKEN_TYPES, "token_types" }, + { LLM_TENSOR_ATTN_OUT_NORM, "blk.%d.attn_output_norm" }, + { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_LAYER_OUT_NORM, "blk.%d.layer_output_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { 
LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + }, + }, { LLM_ARCH_JINA_BERT_V2, { diff --git a/src/llama-arch.h b/src/llama-arch.h index 98ca00a1bd0b0..41a023da3da6e 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -23,6 +23,7 @@ enum llm_arch { LLM_ARCH_REFACT, LLM_ARCH_BERT, LLM_ARCH_NOMIC_BERT, + LLM_ARCH_NOMIC_BERT_MOE, LLM_ARCH_JINA_BERT_V2, LLM_ARCH_BLOOM, LLM_ARCH_STABLELM, @@ -110,6 +111,7 @@ enum llm_kv { LLM_KV_EXPERT_WEIGHTS_SCALE, LLM_KV_EXPERT_WEIGHTS_NORM, LLM_KV_EXPERT_GATING_FUNC, + LLM_KV_MOE_EVERY_N_LAYERS, LLM_KV_POOLING_TYPE, LLM_KV_LOGIT_SCALE, LLM_KV_DECODER_START_TOKEN_ID, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e6595fb18bc5b..2706ea2635444 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -925,28 +925,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn( ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] cb(up, "ffn_moe_up", il); - ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] - cb(gate, "ffn_moe_gate", il); + ggml_tensor * experts = nullptr; + if (gate_exps) { + cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate", il); + } else { + cur = up; + } switch (type_op) { case LLM_FFN_SILU: { - gate = ggml_silu(ctx0, gate); - cb(gate, "ffn_moe_silu", il); + cur = ggml_silu(ctx0, cur); + cb(cur, "ffn_moe_silu", il); } break; case LLM_FFN_GELU: { - gate = ggml_gelu(ctx0, gate); - cb(gate, "ffn_moe_gelu", il); + cur = ggml_gelu(ctx0, cur); + cb(cur, "ffn_moe_gelu", il); } break; default: GGML_ABORT("fatal error"); } - ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens] - cb(par, "ffn_moe_gate_par", il); + if (gate_exps) { + cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens] + cb(cur, "ffn_moe_gate_par", il); + } - ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens] + experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens] cb(experts, "ffn_moe_down", il); if (!weight_before_ffn) { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 80fcd65df0d3c..7ee6a5b75ad1e 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -66,6 +66,7 @@ struct llama_hparams { float expert_weights_scale = 0.0; bool expert_weights_norm = false; uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE; + uint32_t moe_every_n_layers = 0; float f_norm_eps; float f_norm_rms_eps; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index df2791002e9f9..2ec55d55a37be 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -695,10 +695,12 @@ void llama_model::load_hparams(llama_model_loader & ml) { } } break; case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn); ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type); + ml.get_key(LLM_KV_MOE_EVERY_N_LAYERS, hparams.moe_every_n_layers, 0); if (hparams.n_layer == 12 && hparams.n_embd == 768) { type = LLM_TYPE_137M; @@ -2057,6 +2059,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } break; case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); type_embd = 
create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0); @@ -2090,20 +2093,31 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.wqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, 0); } + if (arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bqkv = create_tensor(tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, 0); + } + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); layer.attn_out_norm = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "weight", i), {n_embd}, 0); layer.attn_out_norm_b = create_tensor(tn(LLM_TENSOR_ATTN_OUT_NORM, "bias", i), {n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - - if (arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && i % hparams.moe_every_n_layers == 1) { layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); - layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); - layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + layer.ffn_up_exps = create_tensor(tn(LLM_TENSOR_FFN_UP_EXPS, "weight", i), { n_embd, n_ff, n_expert}, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), { n_ff, n_embd, n_expert}, 0); + layer.ffn_gate_inp = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_expert}, 0); } else { - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + + if (arch == LLM_ARCH_BERT || arch == LLM_ARCH_NOMIC_BERT_MOE) { + layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, 0); + layer.ffn_up_b = create_tensor(tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, 0); + layer.ffn_down_b = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, 0); + } else { + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + } } layer.layer_out_norm = create_tensor(tn(LLM_TENSOR_LAYER_OUT_NORM, "weight", i), {n_embd}, 0); @@ -5730,6 +5744,11 @@ struct llm_build_bert : public llm_graph_context { cur = build_lora_mm(model.layers[il].wqkv, cur); cb(cur, "wqkv", il); + if (model.arch == LLM_ARCH_NOMIC_BERT_MOE) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd))); Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd))); Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa))); @@ -5782,13 +5801,29 @@ struct llm_build_bert : public llm_graph_context { cb(ffn_inp, "ffn_inp", il); // feed-forward network - if (model.arch == LLM_ARCH_BERT) { + if (hparams.moe_every_n_layers > 0 && il % hparams.moe_every_n_layers == 1) { + // MoE branch + cur = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + nullptr, + model.layers[il].ffn_down_exps, + nullptr, + hparams.n_expert, + hparams.n_expert_used, + LLM_FFN_GELU, + false, false, + 0.0f, + LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, il); + cb(cur, "ffn_moe_out", il); + } else if (model.arch == LLM_ARCH_BERT || model.arch 
== LLM_ARCH_NOMIC_BERT_MOE) { cur = build_ffn(cur, model.layers[il].ffn_up, model.layers[il].ffn_up_b, NULL, NULL, NULL, NULL, model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); } else if (model.arch == LLM_ARCH_JINA_BERT_V2) { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -5796,6 +5831,7 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, model.layers[il].ffn_down_b, NULL, NULL, LLM_FFN_GELU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } else { cur = build_ffn(cur, model.layers[il].ffn_up, NULL, NULL, @@ -5803,8 +5839,8 @@ struct llm_build_bert : public llm_graph_context { model.layers[il].ffn_down, NULL, NULL, NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); } - cb(cur, "ffn_out", il); // attentions bypass the intermediate layer cur = ggml_add(ctx0, cur, ffn_inp); @@ -12843,6 +12879,7 @@ llm_graph_result_ptr llama_model::build_graph( case LLM_ARCH_BERT: case LLM_ARCH_JINA_BERT_V2: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: { llm = std::make_unique(*this, params, gf); } break; @@ -13201,6 +13238,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_DBRX: case LLM_ARCH_BERT: case LLM_ARCH_NOMIC_BERT: + case LLM_ARCH_NOMIC_BERT_MOE: case LLM_ARCH_STABLELM: case LLM_ARCH_BITNET: case LLM_ARCH_QWEN: From b6ce7430b7eb51f032152316880204e0a9c0470e Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 29 Apr 2025 08:45:49 +0200 Subject: [PATCH 058/200] llama-graph : fix text position for mrope (#13159) * llama-graph : fix text position for mrope * fix typo * explicitly set 4th dim in the loop --- src/llama-graph.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 2706ea2635444..fabb9ca237653 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -55,13 +55,16 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && pos) { const int64_t n_tokens = ubatch->n_tokens; - if (ubatch->token && n_pos_per_embd > 1) { + if (ubatch->token && n_pos_per_embd == 4) { // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D - // the other dimensions are all 0, they are unused for text tokens - std::vector pos_data(n_tokens*n_pos_per_embd, 0); + // the 3 first dims are the same, and 4th dim is all 0 + std::vector pos_data(n_tokens*n_pos_per_embd); // copy the first dimension for (int i = 0; i < n_tokens; ++i) { - pos_data[i] = ubatch->pos[i]; + pos_data[ i] = ubatch->pos[i]; + pos_data[ n_tokens + i] = ubatch->pos[i]; + pos_data[2 * n_tokens + i] = ubatch->pos[i]; + pos_data[3 * n_tokens + i] = 0; // 4th dim is 0 } ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos)); } else { From e98b3692be4cd8fbbd9a56fbacc2f2bf0bf26a68 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 29 Apr 2025 11:00:31 +0200 Subject: [PATCH 059/200] llama : set qwen3 model type sizes (#13175) --- src/llama-model.cpp | 10 ++++++++++ src/llama-model.h | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2ec55d55a37be..2e0eb036e060f 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -92,6 +92,10 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_290B: return "290B"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_128E: return "17Bx128E (Maverick)"; + case 
LLM_TYPE_0_6B: return "0.6B"; + case LLM_TYPE_1_7B: return "1.7B"; + case LLM_TYPE_30B_A3B: return "30B.A3B"; + case LLM_TYPE_235B_A22B: return "235B.A22B"; default: return "?B"; } } @@ -793,6 +797,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 28: type = hparams.n_embd == 1024 ? LLM_TYPE_0_6B : LLM_TYPE_1_7B; break; + case 36: type = hparams.n_embd == 2560 ? LLM_TYPE_4B : LLM_TYPE_8B; break; + case 40: type = LLM_TYPE_14B; break; + case 64: type = LLM_TYPE_32B; break; default: type = LLM_TYPE_UNKNOWN; } } break; @@ -802,6 +810,8 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { + case 48: type = LLM_TYPE_30B_A3B; break; + case 94: type = LLM_TYPE_235B_A22B; break; default: type = LLM_TYPE_UNKNOWN; } } break; diff --git a/src/llama-model.h b/src/llama-model.h index fd82d106ccda8..167632e186b70 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -88,6 +88,10 @@ enum llm_type { LLM_TYPE_290B, LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_128E, // llama4 Maverick + LLM_TYPE_0_6B, + LLM_TYPE_1_7B, + LLM_TYPE_30B_A3B, + LLM_TYPE_235B_A22B, }; struct llama_layer_posnet { From 00e3e5a194e88e604e7c91391b9e90332888fd72 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Tue, 29 Apr 2025 11:47:04 +0200 Subject: [PATCH 060/200] mtmd : add qwen2vl and qwen2.5vl (#13141) * llava : add clip_n_output_tokens, deprecate clip_n_patches * mtmd : add qwen2vl and qwen2.5vl * decode_embd_batch::set_position_... * working version * deprecate llama-qwen2vl-cli * correct order W, H of clip_embd_nbytes_by_img * edit existing line in hot topics --- README.md | 2 +- examples/llava/CMakeLists.txt | 8 +- examples/llava/clip.cpp | 34 +++- examples/llava/clip.h | 19 ++- examples/llava/llava.cpp | 15 +- examples/llava/mtmd-cli.cpp | 36 +---- examples/llava/mtmd.cpp | 146 +++++++++++++++--- examples/llava/mtmd.h | 9 +- .../{qwen2vl-cli.cpp => qwen2vl-test.cpp} | 2 + examples/llava/tests.sh | 4 +- 10 files changed, 196 insertions(+), 79 deletions(-) rename examples/llava/{qwen2vl-cli.cpp => qwen2vl-test.cpp} (99%) diff --git a/README.md b/README.md index 1785493c3e2b0..42c0eb633ef5d 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ## Hot topics - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9) -- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated +- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141]((https://github.com/ggml-org/llama.cpp/pull/13141))), `libllava` will be deprecated - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim diff --git a/examples/llava/CMakeLists.txt b/examples/llava/CMakeLists.txt index 6409b4f5e6cd0..27b6d27e5cac3 100644 --- a/examples/llava/CMakeLists.txt +++ b/examples/llava/CMakeLists.txt @@ -64,13 +64,7 @@ 
endif() add_executable(llama-llava-cli deprecation-warning.cpp) add_executable(llama-gemma3-cli deprecation-warning.cpp) add_executable(llama-minicpmv-cli deprecation-warning.cpp) - -set(TARGET llama-qwen2vl-cli) -add_executable(${TARGET} qwen2vl-cli.cpp) -set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli) -install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(${TARGET} PRIVATE cxx_std_17) +add_executable(llama-qwen2vl-cli deprecation-warning.cpp) set(TARGET llama-mtmd-cli) add_executable(${TARGET} mtmd-cli.cpp) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index a5eb55f4d412d..ad3e7df1d8a3a 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -2825,15 +2825,18 @@ void clip_free(clip_ctx * ctx) { delete ctx; } +// deprecated size_t clip_embd_nbytes(const struct clip_ctx * ctx) { - return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float); + const int32_t nx = ctx->vision_model.hparams.image_size; + const int32_t ny = ctx->vision_model.hparams.image_size; + return clip_embd_nbytes_by_img(ctx, nx, ny); } -size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) { +size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) { clip_image_f32 img; img.nx = img_w; img.ny = img_h; - return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); + return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float); } int32_t clip_get_image_size(const struct clip_ctx * ctx) { @@ -2863,14 +2866,37 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) { return ctx->vision_model.hparams.image_grid_pinpoints.size(); } +// deprecated int clip_n_patches(const struct clip_ctx * ctx) { clip_image_f32 img; img.nx = ctx->vision_model.hparams.image_size; img.ny = ctx->vision_model.hparams.image_size; - return clip_n_patches_by_img(ctx, &img); + return clip_n_output_tokens(ctx, &img); } +// deprecated int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + return clip_n_output_tokens(ctx, img); +} + +int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + const int n_total = clip_n_output_tokens(ctx, img); + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0); + } + return n_total; +} + +int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) { + const auto & params = ctx->vision_model.hparams; + if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) { + return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0); + } + return 1; +} + +int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) { const auto & params = ctx->vision_model.hparams; int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size); diff --git a/examples/llava/clip.h b/examples/llava/clip.h index 6ba42ad892146..0a53bd8eb78e1 100644 --- a/examples/llava/clip.h +++ b/examples/llava/clip.h @@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par CLIP_API void clip_free(struct clip_ctx * ctx); CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx); -CLIP_API size_t 
clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w); +CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h); CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx); CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx); @@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx); CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx); CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx); -CLIP_API int clip_n_patches (const struct clip_ctx * ctx); -CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img); -CLIP_API int clip_n_mmproj_embd (const struct clip_ctx * ctx); +GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx), + "use clip_n_output_tokens instead"); +GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img), + "use clip_n_output_tokens instead"); + +CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// for M-RoPE, this will be the number of token positions in X and Y directions +// for other models, X will be the total number of tokens and Y will be 1 +CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img); +CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img); + +// this should be equal to the embedding dimension of the text model +CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx); CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip); CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size); diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp index 03a22cbb4c205..c00d16aefff10 100644 --- a/examples/llava/llava.cpp +++ b/examples/llava/llava.cpp @@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair< } // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out) -static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) { +static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) { struct { struct ggml_context * ctx; } model; @@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector model.ctx = ggml_init(params); - struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4 + struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4 // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false); // fill it with the image embeddings, ignoring the base for (size_t i = 1; i < num_images; i++) { @@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context // append without newline tokens (default 
behavior in llava_arch when not using unpad ): - memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches - *n_img_pos_out = static_cast(result->ne[1]+clip_n_patches(ctx_clip)); + memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches + *n_img_pos_out = static_cast(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input)); // Debug: Test single segments // Current findings: sending base image, sending a segment embedding all works similar to python @@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes_by_img(ctx_clip, nx, ny)); - n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res); + n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res); } *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { @@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli } else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) { // flat / default llava-1.5 type embedding - *n_img_pos = clip_n_patches(ctx_clip); clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0); + *n_img_pos = clip_n_output_tokens(ctx_clip, img_res); bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096 if (!encoded) { LOG_ERR("Unable to encode image\n"); @@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size); int n_img_pos_out; - clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out); + clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0); + clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input); *n_img_pos = n_img_pos_out; for (size_t i = 0; i < image_embd_v.size(); i++) { diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 250e8c9a9e871..4d857ca64e0b4 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -136,39 +136,6 @@ struct mtmd_cli_context { } }; -struct decode_embd_batch { - std::vector pos; - std::vector n_seq_id; - std::vector seq_id_0; - std::vector seq_ids; - std::vector logits; - llama_batch batch; - decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); - n_seq_id.resize(n_tokens); - seq_ids .resize(n_tokens + 1); - logits .resize(n_tokens); - seq_id_0.resize(1); - seq_id_0[0] = seq_id; - seq_ids [n_tokens] = nullptr; - batch = { - /*n_tokens =*/ n_tokens, - /*tokens =*/ nullptr, - /*embd =*/ embd, - /*pos =*/ pos.data(), - /*n_seq_id =*/ n_seq_id.data(), - /*seq_id =*/ seq_ids.data(), - /*logits =*/ logits.data(), - }; - for (int i = 0; i < n_tokens; i++) { - batch.pos [i] = pos_0 + i; - batch.n_seq_id[i] = 1; - batch.seq_id [i] = seq_id_0.data(); - batch.logits [i] = false; - } - } -}; - static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) { llama_tokens generated_tokens; for (int i = 0; i < n_predict; i++) { @@ -243,7 +210,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, 
std::vect return 1; } - ctx.n_past += mtmd_helper_get_n_tokens(chunks); + ctx.n_past += mtmd_helper_get_n_pos(chunks); return 0; } @@ -371,6 +338,7 @@ int main(int argc, char ** argv) { } } if (g_is_interrupted) LOG("\nInterrupted by user\n"); + LOG("\n\n"); llama_perf_context_print(ctx.lctx); return g_is_interrupted ? 130 : 0; } diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp index f95f0503569f9..7081fd7352bb7 100644 --- a/examples/llava/mtmd.cpp +++ b/examples/llava/mtmd.cpp @@ -40,11 +40,14 @@ struct mtmd_context { llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row + bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE + // TODO @ngxson : add timings mtmd_context(const char * mmproj_fname, const llama_model * text_model, const mtmd_context_params & ctx_params) : + text_model (text_model), print_timings(ctx_params.print_timings), n_threads (ctx_params.n_threads), image_marker (ctx_params.image_marker) @@ -56,9 +59,8 @@ struct mtmd_context { if (!ctx_clip) { throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname)); } - this->text_model = text_model; - GGML_ASSERT(!clip_is_qwen2vl(ctx_clip) && "Qwen2VL model is not supported yet, use llama-qwen2vl-cli instead"); + use_mrope = clip_is_qwen2vl(ctx_clip); int minicpmv_version = clip_is_minicpmv(ctx_clip); if (minicpmv_version == 2) { @@ -126,6 +128,7 @@ struct mtmd_image_tokens_data { struct mtmd_image_tokens { uint32_t nx; // number of tokens in x direction uint32_t ny; // number of tokens in y direction + bool use_mrope_pos = false; // use M-RoPE position counting (the whole image is 1 temporal position) uint32_t n_tokens() const { return nx * ny; } clip_image_f32_batch batch_f32; // preprocessed image patches std::string id; // optional user-defined ID, useful for KV cache tracking @@ -202,6 +205,13 @@ int32_t mtmd_tokenize(mtmd_context * ctx, string_replace_all(prompt_modified, ctx->image_marker, marker_modified); } + else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) { + // <|vision_start|> ... (image embeddings) ... 
<|vision_end|> + marker_modified = "<|vision_start|>" + ctx->image_marker + "<|vision_end|>"; + string_replace_all(prompt_modified, ctx->image_marker, marker_modified); + + } + // llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix std::vector parts = string_split_str(prompt_modified, ctx->image_marker); @@ -226,7 +236,7 @@ int32_t mtmd_tokenize(mtmd_context * ctx, for (auto & entry : batch_f32.entries) { mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = clip_n_patches_by_img(ctx->ctx_clip, entry.get()); + image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get()); image_tokens->ny = 1; image_tokens->batch_f32.entries.push_back(std::move(entry)); image_tokens->id = id; @@ -322,12 +332,20 @@ int32_t mtmd_tokenize(mtmd_context * ctx, } else { size_t n_tokens = 0; for (const auto & entry : batch_f32.entries) { - n_tokens += clip_n_patches_by_img(ctx->ctx_clip, entry.get()); + n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get()); } mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens); - image_tokens->nx = n_tokens; - image_tokens->ny = 1; // TODO + if (ctx->use_mrope) { + // for Qwen2VL, we need this information for M-RoPE decoding positions + image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get()); + image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get()); + image_tokens->use_mrope_pos = true; + } else { + // other models, we only need the total number of tokens + image_tokens->nx = n_tokens; + image_tokens->ny = 1; + } image_tokens->batch_f32 = std::move(batch_f32); image_tokens->id = bitmaps[i_img].id; // optional @@ -372,6 +390,13 @@ std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens) { return image_tokens->id; } +llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens) { + if (image_tokens->use_mrope_pos) { + return 1; // for M-RoPE, the whole image is 1 in temporal dimension + } + return image_tokens->n_tokens(); +} + int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) { int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd); @@ -389,7 +414,7 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode() const auto & entries = image_tokens->batch_f32.entries; for (size_t i = 0; i < entries.size(); i++) { - int n_tokens_per_image = clip_n_patches_by_img(ctx->ctx_clip, entries[i].get()); + int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get()); ok = clip_image_encode( ctx->ctx_clip, ctx->n_threads, @@ -417,7 +442,7 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) { if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { n_tokens += chunk.tokens_text.size(); } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { - n_tokens += chunk.tokens_image->n_tokens(); + n_tokens += mtmd_image_tokens_get_n_tokens(chunk.tokens_image.get()); } else { GGML_ASSERT(false && "chunk type not supported"); } @@ -425,22 +450,38 @@ size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks) { return n_tokens; } +llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks) { + llama_pos n_pos = 0; + for (auto & chunk : chunks) { + if (chunk.type == MTMD_INPUT_CHUNK_TYPE_TEXT) { + n_pos += chunk.tokens_text.size(); + } else if (chunk.type == MTMD_INPUT_CHUNK_TYPE_IMAGE) { + 
n_pos += mtmd_image_tokens_get_n_pos(chunk.tokens_image.get()); + } else { + GGML_ASSERT(false && "chunk type not supported"); + } + } + return n_pos; +} + // helper struct to make working with embd batch easier // note: this will be removed after llama_batch_ext refactoring struct decode_embd_batch { + int n_pos_per_embd; + int n_mmproj_embd; std::vector pos; + std::vector pos_view; // used by mrope std::vector n_seq_id; std::vector seq_id_0; std::vector seq_ids; std::vector logits; llama_batch batch; - decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) { - pos .resize(n_tokens); + decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) { + pos .resize(n_tokens * n_pos_per_embd); n_seq_id.resize(n_tokens); seq_ids .resize(n_tokens + 1); logits .resize(n_tokens); seq_id_0.resize(1); - seq_id_0[0] = seq_id; seq_ids [n_tokens] = nullptr; batch = { /*n_tokens =*/ n_tokens, @@ -451,13 +492,64 @@ struct decode_embd_batch { /*seq_id =*/ seq_ids.data(), /*logits =*/ logits.data(), }; - for (int i = 0; i < n_tokens; i++) { + } + + void set_position_normal(llama_pos pos_0, llama_seq_id seq_id) { + seq_id_0[0] = seq_id; + for (int i = 0; i < batch.n_tokens; i++) { batch.pos [i] = pos_0 + i; batch.n_seq_id[i] = 1; batch.seq_id [i] = seq_id_0.data(); batch.logits [i] = false; } } + + void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) { + GGML_ASSERT(n_pos_per_embd == 4); + seq_id_0[0] = seq_id; + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + int i = y * nx + x; + pos[i ] = pos_0; + pos[i + batch.n_tokens ] = pos_0 + y; + pos[i + batch.n_tokens * 2] = pos_0 + x; + pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused + } + } + for (int i = 0; i < batch.n_tokens; i++) { + batch.n_seq_id[i] = 1; + batch.seq_id [i] = seq_id_0.data(); + batch.logits [i] = false; + } + } + + llama_batch get_view(int offset, int n_tokens) { + llama_pos * pos_ptr; + pos_view.clear(); + pos_view.resize(n_tokens * n_pos_per_embd); + if (n_pos_per_embd > 1) { + // mrope + // for example, with layout of src: 1234...1234...1234...1234... + // offset 2 will give us dst: 34...34...34...34... + for (int i = 0; i < n_pos_per_embd; i++) { + auto src = pos.begin() + i * batch.n_tokens + offset; + pos_view.insert(pos_view.end(), src, src + n_tokens); + } + pos_ptr = pos_view.data(); + } else { + // normal + pos_ptr = pos.data() + offset; + } + return { + /*n_tokens =*/ n_tokens, + /*tokens =*/ nullptr, + /*embd =*/ batch.embd + offset * n_mmproj_embd, + /*pos =*/ pos_ptr, + /*n_seq_id =*/ batch.n_seq_id + offset, + /*seq_id =*/ batch.seq_id + offset, + /*logits =*/ batch.logits + offset, + }; + } }; int32_t mtmd_helper_eval(mtmd_context * ctx, @@ -470,6 +562,7 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, llama_pos n_past = pos0; llama_batch text_batch = llama_batch_init(n_batch, 0, 1); int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip); + int n_pos_per_embd = mtmd_decode_use_mrope(ctx) ? 
4 : 1; for (auto & chunk : chunks) { bool is_last = &chunk == &chunks.back(); @@ -517,6 +610,16 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, int32_t i_batch = 0; int32_t n_img_batches = GGML_PAD(n_tokens, n_batch) / n_batch; float * embd = mtmd_get_output_embd(ctx); + decode_embd_batch batch_embd(embd, n_tokens, n_pos_per_embd, n_mmproj_embd); + + const int nx = mtmd_image_tokens_get_nx(chunk.tokens_image.get()); + const int ny = mtmd_image_tokens_get_ny(chunk.tokens_image.get()); + + if (mtmd_decode_use_mrope(ctx)) { + batch_embd.set_position_mrope(n_past, nx, ny, seq_id); + } else { + batch_embd.set_position_normal(n_past, seq_id); + } if (mtmd_decode_use_non_causal(ctx)) { llama_set_causal_attn(lctx, false); @@ -524,15 +627,14 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, } while (i_batch < n_img_batches) { // split into batches - int32_t pos_offset = i_batch*n_batch; - int32_t n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); - float * embd_batch = embd + pos_offset*n_mmproj_embd; - decode_embd_batch batch_img(embd_batch, n_tokens_batch, n_past, 0); + int pos_offset = i_batch*n_batch; + int n_tokens_batch = std::min(n_batch, n_tokens - pos_offset); + llama_batch batch_embd_view = batch_embd.get_view(pos_offset, n_tokens_batch); - printf("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); + LOG_INF("decoding image batch %d/%d, n_tokens_batch = %d\n", i_batch+1, n_img_batches, n_tokens_batch); int64_t t1 = ggml_time_ms(); - ret = llama_decode(lctx, batch_img.batch); + ret = llama_decode(lctx, batch_embd_view); if (ret != 0) { LOG_ERR("failed to decode image\n"); llama_set_causal_attn(lctx, true); // restore causal attn @@ -545,9 +647,11 @@ int32_t mtmd_helper_eval(mtmd_context * ctx, } i_batch++; - n_past += n_tokens_batch; } + // for mrope, one image is one single **temporal** position + n_past += mtmd_decode_use_mrope(ctx) ? 
1 : n_tokens; + if (mtmd_decode_use_non_causal(ctx)) { llama_set_causal_attn(lctx, true); } @@ -595,6 +699,10 @@ bool mtmd_decode_use_non_causal(mtmd_context * ctx) { return false; } +bool mtmd_decode_use_mrope(mtmd_context * ctx) { + return ctx->use_mrope; +} + void mtmd_image_tokens_deleter::operator()(mtmd_image_tokens * val) { mtmd_image_tokens_free(val); } diff --git a/examples/llava/mtmd.h b/examples/llava/mtmd.h index 78be192dd6eb6..6805e5e4816c3 100644 --- a/examples/llava/mtmd.h +++ b/examples/llava/mtmd.h @@ -102,6 +102,7 @@ MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * im MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens); MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens); MTMD_API std::string mtmd_image_tokens_get_id(const mtmd_image_tokens * image_tokens); +MTMD_API llama_pos mtmd_image_tokens_get_n_pos(const mtmd_image_tokens * image_tokens); // number of temporal positions (always 1 for M-RoPE, n_tokens otherwise) MTMD_API void mtmd_image_tokens_free(mtmd_image_tokens * image_tokens); // returns 0 on success @@ -114,15 +115,21 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); // whether we need to set non-causal mask before llama_decode MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); +// whether the current model use M-RoPE for llama_decode +MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); + // // helper functions (can be implemented based on other functions) // -// helper to count the total number of tokens from a list of chunks, useful to keep track of n_past +// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache MTMD_API size_t mtmd_helper_get_n_tokens(mtmd_input_chunks & chunks); +// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past +MTMD_API llama_pos mtmd_helper_get_n_pos(mtmd_input_chunks & chunks); + // helper function that automatically: // 1. run llama_decode() on text chunks // 2. 
run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() diff --git a/examples/llava/qwen2vl-cli.cpp b/examples/llava/qwen2vl-test.cpp similarity index 99% rename from examples/llava/qwen2vl-cli.cpp rename to examples/llava/qwen2vl-test.cpp index 1e54851ea07a0..7f9e3dca885c6 100644 --- a/examples/llava/qwen2vl-cli.cpp +++ b/examples/llava/qwen2vl-test.cpp @@ -27,6 +27,8 @@ #include #include +// THIS FILE IS ONLY USED FOR TESTING THE QWEN2VL MODEL +// IT IS NOT A PRODUCTION CODE static bool qwen2vl_eval_image_embed(llama_context * ctx_llama, const struct llava_image_embed * image_embed, int n_batch, int * n_past, int * st_pos_id, struct clip_image_size * image_size) { diff --git a/examples/llava/tests.sh b/examples/llava/tests.sh index 4002f9d531bd2..75604315cfeba 100755 --- a/examples/llava/tests.sh +++ b/examples/llava/tests.sh @@ -54,8 +54,8 @@ add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M" add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K" add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0" -add_test "llama-qwen2vl-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" -add_test "llama-qwen2vl-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M" +add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M" # to test the big models, run: ./tests.sh big add_test_big "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M" From 7d3af70b089bb349b5d17eb01839224c99ec1952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Tue, 29 Apr 2025 13:25:53 +0200 Subject: [PATCH 061/200] llama : llm_type order by size (#13177) --- src/llama-model.cpp | 8 ++++---- src/llama-model.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2e0eb036e060f..759669c178d3b 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -43,11 +43,13 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_5B: return "0.5B"; + case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_1B: return "1B"; case LLM_TYPE_1_3B: return "1.3B"; case LLM_TYPE_1_4B: return "1.4B"; case LLM_TYPE_1_5B: return "1.5B"; case LLM_TYPE_1_6B: return "1.6B"; + case LLM_TYPE_1_7B: return "1.7B"; case LLM_TYPE_1_8B: return "1.8B"; case LLM_TYPE_2B: return "2B"; case LLM_TYPE_2_8B: return "2.8B"; @@ -66,6 +68,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_15B: return "15B"; case LLM_TYPE_16B: return "16B"; case LLM_TYPE_20B: return "20B"; + case LLM_TYPE_27B: return "27B"; case LLM_TYPE_30B: return "30B"; case LLM_TYPE_32B: return "32B"; case LLM_TYPE_34B: return "34B"; @@ -74,6 +77,7 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_65B: return "65B"; case LLM_TYPE_70B: return "70B"; case LLM_TYPE_236B: return "236B"; + case LLM_TYPE_290B: return "290B"; case LLM_TYPE_314B: return "314B"; case LLM_TYPE_671B: return "671B"; case LLM_TYPE_SMALL: return "0.1B"; @@ -88,12 +92,8 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_16x3_8B: return "16x3.8B"; case LLM_TYPE_10B_128x3_66B: return "10B+128x3.66B"; case LLM_TYPE_57B_A14B: return "57B.A14B"; - case LLM_TYPE_27B: return "27B"; - case LLM_TYPE_290B: return "290B"; case LLM_TYPE_17B_16E: return "17Bx16E (Scout)"; case LLM_TYPE_17B_128E: return 
"17Bx128E (Maverick)"; - case LLM_TYPE_0_6B: return "0.6B"; - case LLM_TYPE_1_7B: return "1.7B"; case LLM_TYPE_30B_A3B: return "30B.A3B"; case LLM_TYPE_235B_A22B: return "235B.A22B"; default: return "?B"; diff --git a/src/llama-model.h b/src/llama-model.h index 167632e186b70..95eca00266a4b 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -39,11 +39,13 @@ enum llm_type { LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_5B, + LLM_TYPE_0_6B, LLM_TYPE_1B, LLM_TYPE_1_3B, LLM_TYPE_1_4B, LLM_TYPE_1_5B, LLM_TYPE_1_6B, + LLM_TYPE_1_7B, LLM_TYPE_1_8B, LLM_TYPE_2B, LLM_TYPE_2_8B, @@ -62,6 +64,7 @@ enum llm_type { LLM_TYPE_15B, LLM_TYPE_16B, LLM_TYPE_20B, + LLM_TYPE_27B, LLM_TYPE_30B, LLM_TYPE_32B, LLM_TYPE_34B, @@ -70,6 +73,7 @@ enum llm_type { LLM_TYPE_65B, LLM_TYPE_70B, LLM_TYPE_236B, + LLM_TYPE_290B, LLM_TYPE_314B, LLM_TYPE_671B, LLM_TYPE_SMALL, @@ -84,12 +88,8 @@ enum llm_type { LLM_TYPE_16x3_8B, LLM_TYPE_10B_128x3_66B, LLM_TYPE_57B_A14B, - LLM_TYPE_27B, - LLM_TYPE_290B, LLM_TYPE_17B_16E, // llama4 Scout LLM_TYPE_17B_128E, // llama4 Maverick - LLM_TYPE_0_6B, - LLM_TYPE_1_7B, LLM_TYPE_30B_A3B, LLM_TYPE_235B_A22B, }; From b67462cb98a2938d4dd60f63a3062772ccf4aa02 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 14 Feb 2025 21:50:53 +0800 Subject: [PATCH 062/200] ggml-qnn: add Qualcomm QNN backend for GGML --- ggml/CMakeLists.txt | 2 + ggml/include/ggml-qnn.h | 68 + ggml/src/CMakeLists.txt | 1 + ggml/src/ggml-backend-reg.cpp | 8 + ggml/src/ggml-qnn/CMakeLists.txt | 33 + ggml/src/ggml-qnn/ggml-qnn.cpp | 3932 ++++++++++++++++++++++++++++++ scripts/build-run-android.sh | 202 ++ 7 files changed, 4246 insertions(+) create mode 100644 ggml/include/ggml-qnn.h create mode 100644 ggml/src/ggml-qnn/CMakeLists.txt create mode 100644 ggml/src/ggml-qnn/ggml-qnn.cpp create mode 100755 scripts/build-run-android.sh diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 61fe15a15f074..b12a4fa47c420 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -204,6 +204,7 @@ option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON) set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING "gmml: OpenCL API version to target") +option(GGML_QNN "ggml: use QNN" OFF) # toolchain for vulkan-shaders-gen set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen") @@ -269,6 +270,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h + include/ggml-qnn.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h new file mode 100644 index 0000000000000..06f143546ad24 --- /dev/null +++ b/ggml/include/ggml-qnn.h @@ -0,0 +1,68 @@ + /* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#pragma once + +#include "ggml.h" +#include "ggml-backend.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_QNN_MAX_DEVICES 3 +#define GGML_QNN_BACKEND_NAME "qnn" + +enum QNNBackend { + QNN_BACKEND_CPU, + QNN_BACKEND_GPU, + QNN_BACKEND_NPU, + QNN_BACKEND_GGML, //"fake" QNN backend for compare performance between QNN backend and cpu backend +}; + +GGML_BACKEND_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); + +GGML_BACKEND_API bool ggml_backend_is_qnn(ggml_backend_t backend); + +GGML_BACKEND_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); + +GGML_BACKEND_API int ggml_backend_qnn_get_device_count(void); + +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_qnn_reg(void); + +inline const char * ggml_backend_qnn_get_devname(size_t dev_num) { + switch (dev_num) { + case QNN_BACKEND_CPU: + return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; + } +} + +#ifdef __cplusplus +} +#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 43d9fc4fe25e0..8e8cb81bda0a7 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -313,6 +313,7 @@ ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) ggml_add_backend(OpenCL) +ggml_add_backend(QNN) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 405d8e31514b5..9030de3cfeef9 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -65,6 +65,10 @@ #include "ggml-kompute.h" #endif +#ifdef GGML_USE_QNN +#include "ggml-qnn.h" +#endif + // disable C++17 deprecation warning for std::codecvt_utf8 #if defined(__clang__) # pragma clang diagnostic push @@ -187,6 +191,9 @@ struct ggml_backend_registry { #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif +#ifdef GGML_USE_QNN + register_backend(ggml_backend_qnn_reg()); +#endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif @@ -577,6 +584,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("vulkan", silent, dir_path); ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); + ggml_backend_load_best("qnn", silent, dir_path); ggml_backend_load_best("cpu", silent, dir_path); // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend const char * backend_path = std::getenv("GGML_BACKEND_PATH"); diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt new file mode 100644 index 0000000000000..7bbb9be76b4f6 --- /dev/null +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -0,0 +1,33 @@ +message(STATUS "Using QNN backend") + +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log) + 
set(QNN_LINK_LIBRARIES ${LOG_LIB}) + set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") +else() + message(FATAL_ERROR "QNN now only available on Android") +endif() + +if(NOT DEFINED GGML_QNN_SDK_PATH) + # try read from environment variable + if(DEFINED ENV{QNN_SDK_PATH}) + set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) + else() + message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") + endif() +endif() + +message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") + +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + +file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") +ggml_add_backend_library(ggml-qnn + ${QNN_SOURCES} +) + +target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) +target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) + +string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp new file mode 100644 index 0000000000000..d29c6cb6f9222 --- /dev/null +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -0,0 +1,3932 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * this is implementation of ggml-qnn(ggml-qnn backend of Qualcomm QNN(Qualcomm Neural Network, + * aka Qualcomm AI Engine Direct) + * + * Qualcomm QNN SDK and reference tech guides could be found at: + * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk + * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools + * + * the implementation of ggml-qnn backend has six sections: + * section-1 does forward/external declaration, + * section-2 defines ggml-qnn internal log function + * section-3 does general helper macro / data structure / function + * section-4 does QNN helper macro / data structure / function + * section-5 does ggml-qnn backend helper macro / data structure / function / class + * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem + * + * currently only provide GGML_OP_ADD's QNN backend implementation: + * - GGML_OP_ADD: this is skeleton, can expand other ggml ops as expertise + * + * of course, can porting ggml-qnn to Windows on ARM as need. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if (defined __ANDROID__) || (defined ANDROID) +#include "android/log.h" +#endif + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" +#include "HTP/QnnHtpGraph.h" + +#include "ggml-qnn.h" +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + +// ================================================================================================= +// section-1: forward/external declaration +// ================================================================================================= +class qnn_instance; +struct ggml_backend_qnn_context; +static int free_qnn_tensor(Qnn_Tensor_t * tensor); +static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); + +#if (defined __ANDROID__) || (defined ANDROID) +extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...) __attribute__((__format__(printf, 3, 4))); +#endif +static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); + +// ================================================================================================= +// section-2: ggml-qnn internal troubleshooting function +// ================================================================================================= +#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend +#define GGML_QNN_LOGBUF_LEN 4096 +#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_OP_ADD_LOG 1 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 + +#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGMLQNN_DEBUG +#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLQNN_LOG_DEBUG(...) +#endif +static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ + static std::mutex ggmlqnn_log_internal_mutex; + static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(ggmlqnn_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggmlqnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggmlqnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + //for Android APK + __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); +#endif +#if (defined __ANDROID__) || (defined ANDROID) + //do nothing when running on Android phone +#else + //for Windows on ARM + printf("%s\n", s_ggmlqnn_log_internal_buf); +#endif + } + va_end(args); + } +} + +// ================================================================================================= +// section-3: general helper macro / data structure / function +// ================================================================================================= +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete + +#define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) +#define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) + +static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + +static void * ggmlqnn_mallocz_aligned(size_t size, size_t alignment) { + uint8_t * buffer = NULL; + size_t * sp = NULL; + buffer = static_cast(calloc(1, size + GGMLQNN_MEM_ADD(alignment))); + if (!buffer) + return NULL; + sp = (size_t *)buffer; + *sp = size; + buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); + buffer[-1] = buffer - (uint8_t *)sp; + return buffer; +} + +static void * ggmlqnn_malloc_aligned(size_t size, size_t alignment) { + uint8_t * buffer = NULL; + size_t * sp = NULL; + buffer = static_cast(malloc(size + GGMLQNN_MEM_ADD(alignment))); + if (!buffer) + return NULL; + sp = (size_t *)buffer; + *sp = size; + buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); + buffer[-1] = buffer - (uint8_t *)sp; + return buffer; +} + +static void ggmqnn_free_aligned(void * ptr) { + uint8_t * old = (uint8_t *)ptr; + if (!old) + return; + old -= old[-1]; + free(old); +} + +static size_t get_system_total_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + + auto pages = (size_t)sysconf(_SC_PHYS_PAGES); + auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return pages * page_size; +} + +static size_t get_system_free_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.freeram + info.freeswap) * info.mem_unit; + } + + auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); + auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return avail_pages * page_size; +} + +static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) + return 0; + + size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; + + memcpy(dst, src, min_size); + + return min_size; +} + +static char * ggmlqnn_strndup(const char * source, size_t maxlen) { + return ::strndup(source, maxlen); +} + +static void * ggmlqnn_host_malloc(size_t n) { + void * data = NULL; + int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); + return NULL; + } + + return data; +} + +// ================================================================================================= +// section-4: QNN helper macro / data structure / function +// ================================================================================================= +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + GGMLQNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) + +#define QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) +#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) +#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) +#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) +#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) +#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) +#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) +#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) +#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) + +#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) +#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) +#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) + +#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ + set_qnn_op_config_params(op_config, num_of_params, params) + +#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ + set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) + +#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ + set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) 
+#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) + +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + GGMLQNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, + tensor.version); + return 1; + } + return 0; +} + +[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { + if (op_config.version != QNN_OPCONFIG_VERSION_1) { + GGMLQNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", + op_config.v1.name, + op_config.version); + return 1; + } + return 0; +} + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.name; + } + return nullptr; +} + +[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { + return get_qnn_oponfig_name(*op_config); +} + +static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.packageName; + } + return nullptr; +} + +[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_packagename(*op_config); +} + +static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.typeName; + } + return nullptr; +} + +[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_typename(*op_config); +} + +static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfParams; + } + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numparams(*op_config); +} + +static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.params; + } + return nullptr; +} + +[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_params(*op_config); +} + +static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfInputs; + } + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * 
op_config) { + return get_qnn_op_config_numinputs(*op_config); +} + +static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.inputTensors; + } + return nullptr; +} + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_inputs(*op_config); +} + +static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfOutputs; + } + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numoutputs(*op_config); +} + +static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.outputTensors; + } + return nullptr; +} + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_outputs(*op_config); +} + +static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.name = name; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { + set_qnn_op_config_name(*op_config, name); +} + +static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.packageName = package_name; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { + set_qnn_op_config_packagename(*op_config, package_name); +} + +static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.typeName = type_name; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { + set_qnn_op_config_typename(*op_config, type_name); +} + +static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfParams = num_of_params; + op_config.v1.params = params; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + set_qnn_op_config_params(*op_config, num_of_params, params); +} + +static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfInputs = num_of_inputs; + op_config.v1.inputTensors = input_tensors; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); +} + +static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfOutputs = num_of_outputs; + op_config.v1.outputTensors 
= output_tensors; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); +} + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorid(*tensor); +} + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorname(*tensor); +} + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensortype(*tensor); +} + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +[[maybe_unused]] static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dataformat(*tensor); +} + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_datatype(*tensor); +} + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_quantparams(*tensor); +} + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_rank(*tensor); +} + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + +[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dimensions(*tensor); +} + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memtype(*tensor); +} + +static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.clientBuf; + } + return QNN_CLIENT_BUFFER_INIT; +} + 
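Each getter above falls back to a benign default (nullptr, 0, or an UNDEFINED/INIT constant) when the tensor is not a V1 structure, so callers can probe a tensor without first checking its version. A small illustrative guard built on those defaults, with a hypothetical name and not part of this patch, might be:

    // hypothetical sanity check relying on the default values the getters above return
    static bool ggmlqnn_tensor_meta_is_usable(const Qnn_Tensor_t & tensor) {
        return get_qnn_tensorname(tensor)        != nullptr
            && get_qnn_tensor_rank(tensor)        > 0
            && get_qnn_tensor_dimensions(tensor) != nullptr;
    }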
+[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_clientbuf(*tensor); +} + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + +[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memhandle(*tensor); +} + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { + set_qnn_tensor_id(*tensor, id); +} + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { + set_qnn_tensor_name(*tensor, name); +} + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { + set_qnn_tensor_type(*tensor, type); +} + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { + set_qnn_tensor_dataformat(*tensor, format); +} + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { + set_qnn_tensor_datatype(*tensor, dataType); +} + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { + set_qnn_tensor_quantparams(*tensor, params); +} + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { + set_qnn_tensor_rank(*tensor, rank); +} + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { + set_qnn_tensor_dimensions(*tensor, dims); +} + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { + set_qnn_tensor_memtype(*tensor, memType); +} + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t 
clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { + set_qnn_tensor_clientbuf(*tensor, clientBuf); +} + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { + set_qnn_tensor_memhandle(*tensor, handle); +} + +inline static Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { + Qnn_Tensor_t tensor; + tensor.version = version; + if (version == QNN_TENSOR_VERSION_1) { + tensor.v1 = QNN_TENSOR_V1_INIT; + } else if (version == QNN_TENSOR_VERSION_2) { + tensor.v2 = QNN_TENSOR_V2_INIT; + } + return tensor; +} + +static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + VALIDATE_TENSOR_VERSION(src, err); + + dst.version = src.version; + QNN_TENSOR_SET_NAME( + dst, ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + return 1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; + size_t scale_offset_size = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); + ggmlqnn_memscpy(*scale_offset, + scale_offset_size, + src_qparam.axisScaleOffsetEncoding.scaleOffset, + scale_offset_size); + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); + float ** scales = &bwaxis_scale_offset.scales; + int32_t ** offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scale_size); + ggmlqnn_memscpy(*scales, scale_size, src_qparam.bwAxisScaleOffsetEncoding.scales, scale_size); + + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offset_size = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offset_size); + ggmlqnn_memscpy(*offsets, offset_size, src_qparam.bwAxisScaleOffsetEncoding.offsets, offset_size); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); + } + + 
uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + GGMLQNN_LOG_DEBUG("tensor dims %p", dimensions); + if (dimensions == nullptr) { + GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + return 1; + } + ggmlqnn_memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); + QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + + return err; +} + +static int free_qnn_tensor(Qnn_Tensor_t * tensor) { + int err = 0; + VALIDATE_TENSOR_VERSION(*tensor, err); + free((void *) QNN_TENSOR_GET_NAME(*tensor)); + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + free(src_qparam.axisScaleOffsetEncoding.scaleOffset); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + free(src_qparam.bwAxisScaleOffsetEncoding.scales); + if (src_qparam.bwAxisScaleOffsetEncoding.offsets != nullptr) { + free(src_qparam.bwAxisScaleOffsetEncoding.offsets); + } + } + //GGMLQNN_LOG_DEBUG("tensor dims %p", QNN_TENSOR_GET_DIMENSIONS(*tensor)); + free(QNN_TENSOR_GET_DIMENSIONS(*tensor)); + free(tensor); + + return err; +} + + +static size_t qnn_datatype_size(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return sizeof(float); + case QNN_DATATYPE_FLOAT_16: + return sizeof(uint16_t); + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return sizeof(int32_t); + case QNN_DATATYPE_INT_16: + return sizeof(int16_t); + case QNN_DATATYPE_INT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_4: + return sizeof(int8_t); + default: + break; + } + return 0; +} + +static const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return "QNN_DATATYPE_FLOAT_32"; + case QNN_DATATYPE_FLOAT_16: + return "QNN_DATATYPE_FLOAT_16"; + case QNN_DATATYPE_UINT_32: + return "QNN_DATATYPE_UINT_32"; + case QNN_DATATYPE_INT_32: + return "QNN_DATATYPE_INT_32"; + case QNN_DATATYPE_INT_16: + return "QNN_DATATYPE_INT_16"; + case QNN_DATATYPE_INT_8: + return "QNN_DATATYPE_INT_8"; + case QNN_DATATYPE_SFIXED_POINT_8: + return "QNN_DATATYPE_SFIXED_POINT_8"; + case QNN_DATATYPE_SFIXED_POINT_4: + return "QNN_DATATYPE_SFIXED_POINT_4"; + default: + break; + } + return "QNN_DATATYPE_UNDEFINED"; +} + +static const char * qnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { + // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html + switch (qnn_error_code) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; + + // QnnGraph_Error_t + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case QNN_GRAPH_ERROR_INVALID_NAME: + return "QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case 
QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return "QNN_GRAPH_ERROR_SET_PROFILE"; + case QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; + + //QQnnTensor_Error_t + //Invalid context/graph handle in creating tensor + case QNN_TENSOR_ERROR_INVALID_HANDLE: + return "QNN_TENSOR_ERROR_INVALID_HANDLE"; + //Tensor with specified credentials not registered with a context/graph + case QNN_TENSOR_ERROR_DOES_NOT_EXIST: + return "QNN_TENSOR_ERROR_DOES_NOT_EXIST"; + // (deprecated) Tensor has already been registered with backend + case QNN_TENSOR_ERROR_ALREADY_EXISTS: + return "QNN_TENSOR_ERROR_ALREADY_EXISTS"; + // Invalid tensor param. 
+ case QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM: + return "QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM"; + // This tensor param is currently unsupported + case QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM: + return "QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM"; + // Tensor provided for update is invalid + case QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE: + return "QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE"; + + // QnnOpPackage_Error_t + case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: + return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFO: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; + case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: + return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: + return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; + + default: + return "unknown QNN error"; + } +} + +// ================================================================================================= +// section-5:ggml-qnn backend helper macro / data structure / function / class +// ================================================================================================= +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +typedef void (* ggmlqnn_op_func_t)(ggml_backend_t backend, ggml_tensor * op); + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, +}; + +enum qcom_chipset_soc_model { + UNKNOWN_SM = 0, + SM7450 = 41, // v69, 7 Gen1 + SM8350 = 30, // v68, 888 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Gen 4 +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + char soc_desc[GGML_MAX_NAME]; +}; + +//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 7 Gen 1 */ + [SM7450] = { + .soc_model = SM7450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7 Gen 1"}, + + /* Qualcomm SnapDragon 888 */ + [SM8350] = { + .soc_model = SM8350, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 888 "}, + + /* Qualcomm SnapDragon 8 Gen 1 */ + [SM8450] = { + .soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1"}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [SM8475] = { + .soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1+"}, + + /* Qualcomm SnapDragon 8 
Gen 2 */ + [SM8550] = { + .soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 2"}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [SM8650] = { + .soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 3 "}, + + /* Qualcomm SnapDragon 8 Gen 4 */ + [SM8750] = { + .soc_model = SM8750, + .htp_arch = V79, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, + +}; + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char desc[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; + + //FIXME: should I move it from public member of class qnn_instance to here? + //std::map> _qnn_graph_map; +} ; + +//FIXME: the following global vars and three helper funcs should be removed in the future +static int32_t g_ggmltensor_idx = 0; +static void reset_idx() { + g_ggmltensor_idx = 0; +} + +static void inc_idx() { + g_ggmltensor_idx++; +} + +static int32_t get_idx() { + return g_ggmltensor_idx; +} + +// file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + [QNN_BACKEND_CPU] = {.device = 0, + .threads = 1, + .name = "qnn-cpu", + .desc = "Qualcomm Kryo CPU", + .lib = "libQnnCpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, + + [QNN_BACKEND_GPU] = {.device = 1, + .threads = 1, + .name = "qnn-gpu", + .desc = "Qualcomm Adreno GPU", + .lib = "libQnnGpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, + + [QNN_BACKEND_NPU] = {.device = 2, + .threads = 1, + .name = "qnn-npu", + .desc = "Qualcomm NPU(Hexagon Tensor Processor)", + .lib = "libQnnHtp.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, +}; + +using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; +using qnn_dimension_array_t = std::array; +using op_dims_calc_func_t = void (*)(const std::vector & input_dims, + ggml_dimension_array_t & output_dims); + +static void element_wise_op_dims(const std::vector & input_dims, + ggml_dimension_array_t &output_dims) { + for (size_t i = 1; i < std::size(output_dims); i++) { + output_dims[i] = input_dims.front()[i]; + } +} + +static void mat_mul_op_dims(const std::vector & input_dims, + ggml_dimension_array_t & output_dims) { + GGML_ASSERT(input_dims.size() == 2); + output_dims[0] = input_dims.front()[1]; + output_dims[1] = input_dims.back()[1]; +} + +struct qnn_op_caps_t { + const char * qnn_op_name = nullptr; + const size_t input_param_count = 0; + op_dims_calc_func_t calc_dims_func = nullptr; + const char * qnn_param_name = nullptr; +}; + +constexpr static const qnn_op_caps_t kOpCaps[] = { 
+ {}, // GGML_OP_NONE + {}, // GGML_OP_DUP + { + // GGML_OP_ADD + QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name + 2, // input_param_count + element_wise_op_dims, // calc_dims_func + }, + {}, // GGML_OP_ADD1 + {}, // GGML_OP_ACC + {}, // GGML_OP_SUB + {}, // GGML_OP_MUL + {}, // GGML_OP_DIV + {}, // GGML_OP_SQR + {}, // GGML_OP_SQRT + {}, // GGML_OP_LOG + {}, // GGML_OP_SIN + {}, // GGML_OP_COS + {}, // GGML_OP_SUM + {}, // GGML_OP_SUM_ROWS + {}, // GGML_OP_MEAN + {}, // GGML_OP_ARGMAX + {}, // GGML_OP_COUNT_EQUAL + {}, // GGML_OP_REPEAT + {}, // GGML_OP_REPEAT_BACK + {}, // GGML_OP_CONCAT + {}, // GGML_OP_SILU_BACK + {}, // GGML_OP_NORM + {}, // GGML_OP_RMS_NORM + {}, // GGML_OP_RMS_NORM_BACK + {}, // GGML_OP_GROUP_NORM + { + // GGML_OP_MUL_MAT + QNN_OP_MAT_MUL, // qnn_op_name + 2, // input_param_count + mat_mul_op_dims, // calc_dims_func + }, + {}, // GGML_OP_MUL_MAT_ID + {}, // GGML_OP_OUT_PROD + {}, // GGML_OP_SCALE + {}, // GGML_OP_SET + {}, // GGML_OP_CPY + {}, // GGML_OP_CONT + {}, // GGML_OP_RESHAPE + {}, // GGML_OP_VIEW + {}, // GGML_OP_PERMUTE + {}, // GGML_OP_TRANSPOSE + {}, // GGML_OP_GET_ROWS + {}, // GGML_OP_GET_ROWS_BACK + {}, // GGML_OP_DIAG + {}, // GGML_OP_DIAG_MASK_INF + {}, // GGML_OP_DIAG_MASK_ZERO + {}, // GGML_OP_SOFT_MAX + {}, // GGML_OP_SOFT_MAX_BACK + {}, // GGML_OP_ROPE + {}, // GGML_OP_ROPE_BACK + {}, // GGML_OP_CLAMP + {}, // GGML_OP_CONV_TRANSPOSE_1D + {}, // GGML_OP_IM2COL + {}, // GGML_OP_IM2COL_BACK + {}, // GGML_OP_CONV_TRANSPOSE_2D + {}, // GGML_OP_POOL_1D + {}, // GGML_OP_POOL_2D + {}, // GGML_OP_POOL_2D_BACK + {}, // GGML_OP_UPSCALE + {}, // GGML_OP_PAD + {}, // GGML_OP_PAD_REFLECT_1D + {}, // GGML_OP_ARANGE + {}, // GGML_OP_TIMESTEP_EMBEDDING + {}, // GGML_OP_ARGSORT + {}, // GGML_OP_LEAKY_RELU + {}, // GGML_OP_FLASH_ATTN_EXT + {}, // GGML_OP_FLASH_ATTN_BACK + {}, // GGML_OP_SSM_CONV + {}, // GGML_OP_SSM_SCAN + {}, // GGML_OP_WIN_PART + {}, // GGML_OP_WIN_UNPART + {}, // GGML_OP_GET_REL_POS + {}, // GGML_OP_ADD_REL_POS + {}, // GGML_OP_RWKV_WKV6 + {}, // GGML_OP_GATED_LINEAR_ATTN + {}, // GGML_OP_UNARY + {}, // GGML_OP_MAP_UNARY + {}, // GGML_OP_MAP_BINARY + {}, // GGML_OP_MAP_CUSTOM1_F32 + {}, // GGML_OP_MAP_CUSTOM2_F32 + {}, // GGML_OP_MAP_CUSTOM3_F32 + {}, // GGML_OP_MAP_CUSTOM1 + {}, // GGML_OP_MAP_CUSTOM2 + {}, // GGML_OP_MAP_CUSTOM3 + {}, // GGML_OP_CROSS_ENTROPY_LOSS + {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + {}, // GGML_OP_OPT_STEP_ADAMW + {}, // GGML_UNARY_OP_ABS + {}, // GGML_UNARY_OP_SGN + {}, // GGML_UNARY_OP_NEG + {}, // GGML_UNARY_OP_STEP + {}, // GGML_UNARY_OP_TANH + {}, // GGML_UNARY_OP_ELU + {}, // GGML_UNARY_OP_RELU + {}, // GGML_UNARY_OP_SIGMOID + {}, // GGML_UNARY_OP_GELU + {}, // GGML_UNARY_OP_GELU_QUICK + {}, // GGML_UNARY_OP_SILU + {}, // GGML_UNARY_OP_HARDSWISH + {}, // GGML_UNARY_OP_HARDSIGMOID + {}, // GGML_UNARY_OP_EXP +}; + +static const char * qnn_get_socmodel_desc(uint32_t soc_model) { + switch (soc_model) { + case SM7450: + return "SM7450"; + case SM8350: + return "SM8350"; + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + case SM8750: + return "SM8750"; + default: + return "unknown"; + } +} + +static const char * qnn_get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + case V79: + return "QCOM_HTP_V79"; + default: + return "unknown"; + } +} + +static struct qcom_socinfo * 
qnn_get_socinfo_from_socmodel(uint32_t soc_model) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (soc_model == g_qnn_soc_info_table[idx].soc_model) { + return &g_qnn_soc_info_table[idx]; + } + } + return nullptr; +} + +static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + return true; +} + +#define CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { + /* + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; + */ + return ggml_n_dims(tensor); +} + +static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + +static const char * ggml_get_type_name(ggml_type type) { + const struct ggml_type_traits * traits = ggml_get_type_traits(type); + return traits->type_name; +} + +Qnn_Tensor_t * ggml_qnn_create_tensor(const ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {0}; + + //FIXME:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); + GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); + inc_idx(); + + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = ggml_get_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, + .dataSize = 0}}}} + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + GGMLQNN_LOG_WARN("calloc failed"); + return nullptr; + } + error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + GGMLQNN_LOG_WARN("init tensor failed"); + return nullptr; + } + + return p_qnn_tensor; +} + +//TODO: +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum 
ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; + } + return QNN_DATATYPE_UNDEFINED; +} + +//TODO: +static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return GGML_TYPE_F32; + case QNN_DATATYPE_FLOAT_16: + return GGML_TYPE_F16; + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return GGML_TYPE_I32; + case QNN_DATATYPE_INT_16: + return GGML_TYPE_I16; + case QNN_DATATYPE_INT_8: + return GGML_TYPE_I8; + case QNN_DATATYPE_SFIXED_POINT_8: + return GGML_TYPE_Q8_0; + case QNN_DATATYPE_SFIXED_POINT_4: + return GGML_TYPE_Q4_0; + default: + break; + } + return GGML_TYPE_COUNT; +} + +//TODO: +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; +} + +static const char * get_ggml_type_name(ggml_type type) { + const auto * traits = ggml_get_type_traits(type); + return traits->type_name; +} + +static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { + char buffer[256] = {}; + const char * type_name = get_ggml_type_name(tensor->type); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], type_name); + break; + } + GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); + output.append(buffer, len); +} + +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + +static size_t get_qnn_op_index(const ggml_tensor * tensor) { + if (tensor->op == GGML_OP_UNARY) { + return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + return tensor->op; +} + +static size_t get_qnn_op_input_param_count(const ggml_tensor * op) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + return kOpCaps[op_index].input_param_count; +} + +static void get_graph_key_from_op(const ggml_tensor * op, std::string & output) { + GGML_ASSERT(op->op != GGML_OP_NONE); + output += ggml_op_desc(op); + output += get_ggml_type_name(op->type); + size_t param_count = get_qnn_op_input_param_count(op); + for (size_t i = 0; i < param_count; ++i) { + auto * input = op->src[i]; + if (!input) { + break; + } + output += '_'; + append_tensor_dimensions(input, output); + } +} + +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() { + _begin_time = ggml_time_us(); + } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + GGMLQNN_LOG_DEBUG("duration of %s : 
%lld microseconds\n", _perf_name.c_str(), _duration);
+    }
+
+private:
+    int64_t _begin_time = 0LL;
+    int64_t _end_time   = 0LL;
+    int64_t _duration   = 0LL;
+    std::string _perf_name;
+};
+#else
+class qnn_perf {
+public:
+    qnn_perf(const std::string & perf_name) {}
+    qnn_perf() = delete;
+    qnn_perf(const qnn_perf & ) = delete;
+    qnn_perf & operator= (const qnn_perf & ) = delete;
+
+    void start() {}
+    void info() {}
+};
+#endif
+
+template <typename Fn>
+Fn load_qnn_functionpointers(void * handle, const char * function_name) {
+    return reinterpret_cast<Fn>(dlsym(handle, function_name));
+}
+
+class qnn_interface {
+
+#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name)                            \
+    template <typename... Args>                                                    \
+    inline auto qnn_##F(Args... args) const {                                      \
+        return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)(              \
+            std::forward<Args>(args)...);                                          \
+    }
+
+
+#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name)                        \
+    template <typename... Args>                                                    \
+    inline auto qnn_##F(Args... args) const {                                      \
+        return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)(   \
+            std::forward<Args>(args)...);                                          \
+    }
+
+    friend class qnn_instance;
+
+public:
+    qnn_interface() = default;
+
+    // QnnBackend
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion);
+
+    // QnnDevice
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo);
+
+    // QnnContext
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree);
+
+    // QnnGraph
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve);
+
+    // QnnLog
+    DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel);
+
+    // QnnProfile
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree);
+
+    // QnnMem
+    DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister);
+
+    DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister);
+
+    // QnnProperty
+    DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability);
+
+    // QnnTensor
+    DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor,
tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t *_qnn_interface = nullptr; + + const QnnSystemInterface_t *_qnn_sys_interface = nullptr; +}; + +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {}; + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + int init_qnn_graph(const std::string &graph_name, QNNBackend device, size_t vtcm_size_in_mb); + + int finalize_qnn_graph(); + + bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + 
_qnn_power_configid = power_configid; + + return 0; + } + + int set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; + memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); + rpc_pollingtime.option = + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + } + } + return 0; + } + + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + GGMLQNN_LOG_DEBUG("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + power_config.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + power_config.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + power_config.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + + return 0; + } + + std::string &get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); + + void unregister_rpcmem(); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle); + + void *alloc_rpcmem(size_t bytes, size_t alignment); + + void free_rpcmem(void * buf); + + bool is_rpcmem_allocated(void * buf); + + bool 
is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + +public: + std::map> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // name of prebuilt QNN model, might be used in the future + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void *_system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_set _qnn_mem_set; + std::unordered_map _qnn_rpc_buffer_to_handles; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + void *_rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; + + std::string _graph_name; + QNNBackend _device_id; +}; + +std::mutex qnn_instance::_init_mutex; +std::unordered_map qnn_instance::_loaded_lib_handle; +std::unordered_map qnn_instance::_lib_path_to_backend_id; +std::unordered_map qnn_instance::_loaded_backend; + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(ggmlqnn_align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; +} + +void qnn_instance::free_rpcmem(void * buf) { + if 
(!_rpcmem_initialized) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + GGMLQNN_LOG_WARN("no allocated tensor\n"); + } else { + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } +} + +int32_t qnn_instance::rpcmem_to_fd(void * buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; +} + +int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + GGMLQNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + return 2; + } + + if (is_rpcmem_allocated(p_data)) { + GGMLQNN_LOG_WARN("rpc memory already allocated\n"); + //return 3; + } + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + GGMLQNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 4; + } + + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + GGMLQNN_LOG_WARN("failed to get file descriptor\n"); + return 5; + } + GGMLQNN_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register( + _qnn_context_handle, + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), + strerror(error)); + return 6; + } else { + GGMLQNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert(handle); + + return 0; +} + +Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type) { + if (!p_data) { + GGMLQNN_LOG_WARN("invalid param"); + return nullptr; + } + + if (!is_rpcmem_initialized()) { + GGMLQNN_LOG_WARN("rpc memory not initialized"); + return nullptr; + } + + if (is_rpcmem_registered(p_data)) { + GGMLQNN_LOG_WARN("rpc memory already registered"); + return _qnn_rpc_buffer_to_handles[p_data]; + } + + auto mem_fd = rpcmem_to_fd(p_data); + if (mem_fd == -1) { + GGMLQNN_LOG_WARN("failed to get file descriptor"); + return nullptr; + } + + GGMLQNN_LOG_DEBUG("mem_fd %d", mem_fd); + Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, + /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); + return nullptr; + } + + _qnn_rpc_buffer_to_handles.insert({p_data, handle}); + GGMLQNN_LOG_DEBUG("successfully register shared memory handler: %p", handle); + return handle; +} + +void qnn_instance::unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + GGMLQNN_LOG_WARN("no rpcmem registered\n"); + } + + for (auto &mem_handle : _qnn_mem_set) { + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + 
GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + } + } + _qnn_mem_set.clear(); +} + +void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) { + Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); + } + + auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), + [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + if (it == _qnn_rpc_buffer_to_handles.end()) { + GGMLQNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); + return; + } + + _qnn_rpc_buffer_to_handles.erase(it); +} + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + GGMLQNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + GGMLQNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + auto get_providers = + load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + GGMLQNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + // get QnnInterface Providers + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + GGMLQNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + GGMLQNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + GGMLQNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + GGMLQNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + GGMLQNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + GGMLQNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + GGMLQNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + GGMLQNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + +#if 0 //not used in PR, 
keep them here for further use + QnnSaver_Config_t outputdir_cfg; + outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; + outputdir_cfg.outputDirectory = "/data/local/tmp/"; + QnnSaver_Config_t backendid_cfg; + backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; + backendid_cfg.backendId = _backend_id; + const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; + if (0 == QnnSaver_initialize(saverCfg)) { + GGMLQNN_LOG_INFO("QnnSaver_initialize successfully"); + } else { + GGMLQNN_LOG_WARN("QnnSaver_initialize failure"); + } +#endif + auto saver_initialize = + load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( + _loaded_lib_handle[backend_id], "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + GGMLQNN_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + GGMLQNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; +} + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + GGMLQNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + //re-try with Android APK's internal QNN runtime lib path + _lib_path = "/data/data/com.cdeos.kantv/qnnlib/"; + system_lib_path = _lib_path + "libQnnSystem.so"; + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + } + + auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + GGMLQNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + GGMLQNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + GGMLQNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if 
(!found_valid_system_interface) { + GGMLQNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + GGMLQNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); + + _qnn_interface.set_qnn_system_interface(provider_list[0]); + + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + GGMLQNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + GGMLQNN_LOG_INFO("initialize qnn system successfully\n"); + } + + return 0; +} + +int qnn_instance::unload_system() { + int result = 0; + + if (nullptr == _system_lib_handle) { + GGMLQNN_LOG_DEBUG("system lib handle is null\n"); + return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + GGMLQNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; +} + +static void ggml_qnn_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { + + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + + double ms = (double) timestamp / 1000000.0; + + { + std::lock_guard lock(log_mutex); + + memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); +#if GGMLQNN_PRINT_QNN_INTERNAL_LOG + GGMLQNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); +#endif + } +} + +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + GGMLQNN_LOG_DEBUG("enter qni_init\n"); + + const std::lock_guard lock(_init_mutex); + + if (0 != load_system()) { + GGMLQNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + GGMLQNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string bakend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { + int is_load_ok = load_backend(bakend_lib_path, saver_config); + if (0 != is_load_ok) { + GGMLQNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[bakend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + GGMLQNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", + bakend_lib_path.c_str(), + _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + +#if 1 + _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); +#else + _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); 
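+    // whichever branch is compiled is expected to populate _qnn_log_handle;
+    // the handle is validated immediately after the #endif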
+#endif + if (nullptr == _qnn_log_handle) { + GGMLQNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone + return 4; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create(_qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + GGMLQNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnstatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnstatus) { + GGMLQNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnstatus) { + GGMLQNN_LOG_WARN("device property is not known to backend\n"); + } + } + + auto qnnstatus = _qnn_raw_interface.deviceCreate( + _qnn_log_handle, nullptr, &_qnn_device_handle); + if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { + GGMLQNN_LOG_WARN("failed to create QNN device\n"); + } else { + GGMLQNN_LOG_INFO("create device successfully\n"); + } + + if (ggml_qnn_profile_level::profile_off != _profile_level) { + GGMLQNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (ggml_qnn_profile_level::profile_basic == _profile_level) { + GGMLQNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { + GGMLQNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + GGMLQNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 9; + } else { + GGMLQNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free + || nullptr == _pfn_rpc_mem_to_fd) { + GGMLQNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 10; + } + + if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? nullptr : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + GGMLQNN_LOG_WARN("why failed to initialize qnn context\n"); + return 8; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + GGMLQNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLQNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + GGMLQNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); + GGMLQNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ + chipinfo.socModel, qnn_get_socmodel_desc(chipinfo.socModel), \ + htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); + struct qcom_socinfo * socinfo = qnn_get_socinfo_from_socmodel(chipinfo.socModel); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + if (nullptr != socinfo) { + memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); + GGMLQNN_LOG_INFO("soc info:%s", socinfo->soc_desc); + } else { + memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, "unknown", 7); + GGMLQNN_LOG_INFO("soc info:unknown"); + } + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + //TODO: faster approach to probe the accurate capacity of QNN RPC ion memory + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + + if (0 != init_htp_perfinfra()) { + GGMLQNN_LOG_WARN("initialize HTP performance failure"); + } + if (0 != set_rpc_polling()) { + GGMLQNN_LOG_WARN("set RPC polling failure"); + } + if (0 != set_high_performance_mode()) { + GGMLQNN_LOG_WARN("set HTP high performance mode failure"); + } + } + + GGMLQNN_LOG_DEBUG("leave qni_init\n"); + + return 0; +} + +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + //FIXME:should be removed in the future + reset_idx(); + + if 
(nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's mobile SoC equipped low-end phone happy + _pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + GGMLQNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + GGMLQNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + + return ret_status; +} + +int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb) { + _graph_name = graph_name; + _device_id = device; + + GGMLQNN_LOG_DEBUG("[%s][%s]created", ggml_backend_qnn_get_devname(device), graph_name.c_str()); + + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; + if (device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + 
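+        // wrap the VTCM size config the same way as the HVX/DLBC/optimization configs above;
+        // all of them are gathered into graph_configs and handed to qnn_graph_create below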
QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t * graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr}; + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs, &graph_handle); + } else { + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &graph_handle); + } + + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", + ggml_backend_qnn_get_devname(device), graph_name.c_str(), + qnn_get_error_string(error)); + return error; + } + + GGMLQNN_LOG_INFO("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); + _qnn_graph_handle = graph_handle; + return QNN_SUCCESS; +} + +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + int result = 0; + + if (nullptr == graph_name) { + GGMLQNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + GGMLQNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + GGMLQNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, + graph_name, + graph_configs, + &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + GGMLQNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + GGMLQNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; +} + +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, nullptr) + != QNN_GRAPH_NO_ERROR) { + GGMLQNN_LOG_WARN("finalizing graph failure\n"); + return 1; + } + } else { + GGMLQNN_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; +} + +// ================================================================================================= +// section-6: implementation of ggml-qnn backend +// ================================================================================================= +static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dump_tensor_info) { + if (tensor->op == GGML_OP_NONE) { + return true; + } + if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE + || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW + || tensor->op == GGML_OP_PERMUTE) { + return false; + } + + bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); + if (!supported_op) { + return false; + } + + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; + + int64_t ne00 = tensor->src[0]->ne[0]; + int64_t ne01 = tensor->src[0]->ne[1]; + + int64_t ne10 = tensor->src[1]->ne[0]; + int64_t ne11 = tensor->src[1]->ne[1]; + + int64_t ne0 = tensor->ne[0]; + int64_t ne1 = tensor->ne[1]; + + if (tensor->op == GGML_OP_ADD) { + if (!ggml_are_same_shape(src0, src1)) { + return false; + } +#if GGMLQNN_PRINT_OP_ADD_LOG + if (b_dump_tensor_info) { + GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), 
+ ggml_type_name(tensor->type)); + GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); + GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); + GGMLQNN_LOG_DEBUG("GGML_OP_ADD"); + GGMLQNN_LOG_DEBUG( + "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + GGMLQNN_LOG_DEBUG( + "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + GGMLQNN_LOG_DEBUG( + " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, + tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], + tensor->ne[2], + tensor->nb[0], + tensor->nb[1], tensor->nb[2]); + + } +#endif + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) + && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); + + } + + if (tensor->op == GGML_OP_MUL_MAT) { +#if GGMLQNN_PRINT_OP_MUL_MAT_LOG + if (b_dump_tensor_info) { + GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), + ggml_type_name(tensor->type)); + GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); + GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); + GGMLQNN_LOG_DEBUG("dst type:%s", ggml_type_name(tensor->type)); + GGMLQNN_LOG_DEBUG( + "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + GGMLQNN_LOG_DEBUG( + "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + GGMLQNN_LOG_DEBUG( + "dst %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, + tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], + tensor->ne[2], + tensor->nb[0], + tensor->nb[1], tensor->nb[2]); + + } +#endif + //FIXME: 2048 is an experimental value between ASR inference and LLM inference because + // it's better only offload big matrix to QNN backend + if (ne01 <= 2048) { + return false; + } +#if 0 + //TODO: offload mul_mat to QNN backend + //we need to process type traint in func ggml_qnn_mul_mat(...) 
with following case: + //src0: q4_0, q6_k + //src1: f32 + //dst : f32 + return (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) + && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16); +#else + //passthrough mul_mat + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) + && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) + && (src0->type == src1->type) && (src0->type == tensor->type); +#endif + } + + //TODO:for other op + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) + && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) + && (src0->type == src1->type) && (src0->type == tensor->type); +} + +static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + enum ggml_status result = GGML_STATUS_SUCCESS; + bool graph_initialized = false; + qnn_instance * instance = nullptr; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; + std::string graph_name = "ggml_op_qnn_add"; + qnn_perf op_perf = qnn_perf("ggml_qnn_add"); + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + op_perf.start(); + + std::string map_entry; + get_graph_key_from_op(op, map_entry); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + } else { + tensor_0 = ggml_qnn_create_tensor(src0); + tensor_1 = ggml_qnn_create_tensor(src1); + tensor_2 = ggml_qnn_create_tensor(dst); + } + +//#if GGMLQNN_DEBUG //uncomment this line and comment next line when troubleshooting mul_mat issue +#if GGMLQNN_PRINT_OP_ADD_LOG + GGMLQNN_LOG_DEBUG("call %s in dev %s\n", __func__, ctx->name); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + GGMLQNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + GGMLQNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + GGMLQNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + GGMLQNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + + QNN_VER_PTR(*tensor_0)->type = 
QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + graph_name = map_entry; + GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 4; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 3; // 1 or 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + QnnHtpGraph_CustomConfig_t precision_config; + precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + precision_config.precision = QNN_PRECISION_FLOAT16; + QnnGraph_Config_t graph_precision_config; + graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_precision_config.customConfig = &precision_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + &graph_precision_config, + NULL}; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), + p_graphconfig, &graph_handle); + } else { + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), + nullptr, &graph_handle); + } + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, 
ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + } + + //avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; +#if GGMLQNN_PRINT_OP_ADD_LOG + op_perf.info(); +#endif +} + +//TODO: type trait with op->src[0] +/* + * the procedure of ggml_qnn_mul_mat is similar to ggml_qnn_add,but there are type trait process + * for ggml_qnn_mul_mat, so it's a standalone function. + * + * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, we should focus on MUL_MAT. + * + * we have three kinds of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend + * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) 
and src1 is F32, quantize in src0 -> f32 in src0', then src0' * src1 +*/ +static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); + qnn_instance * instance = nullptr; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + op_perf.start(); + + std::string map_entry; + get_graph_key_from_op(op, map_entry); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + } else { + tensor_0 = ggml_qnn_create_tensor(src0); + tensor_1 = ggml_qnn_create_tensor(src1); + tensor_2 = ggml_qnn_create_tensor(dst); + } + +#if GGMLQNN_DEBUG + GGMLQNN_LOG_DEBUG("call %s in dev %s\n", __func__, ctx->name); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + GGMLQNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + GGMLQNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); + GGMLQNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); + GGMLQNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); +#endif + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + graph_name = map_entry; + GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + 
graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + error = qnn_raw_interface.graphAddNode(graph_handle, op_config); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + } else { + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("error = %d\n", error); + } + } + + //avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + 
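+    // restoring the saved dimension pointers lets free_qnn_tensor later release the original
+    // arrays rather than the stack-allocated ones swapped in on the cached-graph path above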
QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + + op_perf.info(); +} + +static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) { + ggmlqnn_op_func_t func = nullptr; + + switch (tensor->op) { + case GGML_OP_ADD: + func = ggml_qnn_add; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + break; + + default: + return false; + } + + if (nullptr != func) + func(backend, tensor); + + return true; +} + +struct ggml_backend_qnn_buffer_context { + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } + + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(qnn_tensor); + } + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; +}; + +static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + delete ctx; +} + +static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + return ctx->buffer; +} + +static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + GGML_UNUSED(error); + GGML_UNUSED(ctx); + return; +} + +static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, const void * data, + size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + +static void ggml_backend_qnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor * tensor, + uint8_t value, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memset((char *)tensor->data + offset, value, size); +} + +static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + +static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + +static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + memset(ctx->buffer, value, ctx->buffer_size); +} + +[[maybe_unused]]static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + free(sub_buffer); + } + ctx->sub_buffers.clear(); +} + +static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_qnn_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, 
+ /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ NULL, +}; + +static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "qnn-buffer"; +} + +static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + + size_t size_page = sysconf(_SC_PAGESIZE); + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + ctx->buffer = ggmlqnn_host_malloc(size_aligned); + ctx->buffer_size = size_aligned; + if (nullptr == ctx->buffer) { + GGMLQNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); +} + +static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + +//FIXME: this value is an experimental value on Xiaomi14 +static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + return (2 * (1 << 30)); +} + +static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} + +static const char * ggml_backend_qnn_name(ggml_backend_t backend) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + return g_qnn_mgr[ctx->device].name; +} + +static void ggml_backend_qnn_free(ggml_backend_t backend) { + GGMLQNN_LOG_DEBUG("enter %s", __func__ ); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + GGMLQNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + + qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; + if (instance != nullptr) { + std::map>::iterator graph_it; + + for (graph_it = instance->_qnn_graph_map.begin(); + graph_it != instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); + Qnn_Tensor_t * tensor_0 = std::get<1>(graph_item); + Qnn_Tensor_t * tensor_1 = std::get<2>(graph_item); + Qnn_Tensor_t * tensor_2 = std::get<3>(graph_item); + GGML_UNUSED(graph_handle); + GGMLQNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + free_qnn_tensor(tensor_0); + free_qnn_tensor(tensor_1); + free_qnn_tensor(tensor_2); + } + instance->_qnn_graph_map.clear(); + + instance->qnn_finalize(); + delete instance; + g_qnn_mgr[ctx->device].instance = nullptr; + } + + if (g_qnn_mgr[ctx->device].backend != nullptr) { + delete backend; + g_qnn_mgr[ctx->device].backend = nullptr; + } + GGMLQNN_LOG_DEBUG("leave %s", __func__ ); +} + +static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + GGML_UNUSED(ctx); + + //GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE + || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW + || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + bool ok 
= ggml_qnn_compute_forward(backend, node); + if (!ok) { + GGMLQNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", + __func__, node->name, ggml_op_name(node->op)); + } + } + + return result; +} + +static const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { + struct ggml_backend_qnn_context *ctx = static_cast(dev->context); + if (nullptr == ctx) { + GGMLQNN_LOG_ERROR("pls check why ctx is null"); + return "unknown"; + } + return ctx->name; + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { + struct ggml_backend_qnn_context * ctx = static_cast(dev->context); + if (nullptr == ctx) { + GGMLQNN_LOG_ERROR("pls check why ctx is null"); + return "unknown"; + } + if (0 == strncmp(ctx->name, "qnn-npu", 7)) { + const char * soc_info = qnn_get_socmodel_desc(ctx->socinfo.soc_model); + const char * htp_arch = qnn_get_htparch_desc(ctx->socinfo.htp_arch); + std::string dev_desc = std::string(ctx->desc) + + std::string(soc_info) + "_" + std::string(htp_arch) + + "," + std::string(ctx->socinfo.soc_desc); + return dev_desc.c_str(); + } else { + return ctx->desc; + } +} + +static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + //FIXME:this is NOT QNN device memory info + *free = get_system_free_memory_in_bytes(); + *total = get_system_total_memory_in_bytes(); + GGML_UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_ACCEL; +} + +static void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, + struct ggml_backend_dev_props * props) { + props->name = ggml_backend_qnn_device_get_name(dev); + props->description = ggml_backend_qnn_device_get_description(dev); + props->type = ggml_backend_qnn_device_get_type(dev); + ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_qnn_device_init_backend(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(dev); + if (nullptr == params) { + params = 0; + } + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) params, + "/data/local/tmp/"); + + return qnn_backend; + +} + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { + if (device_index >= GGML_QNN_MAX_DEVICES) { + GGMLQNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_QNN_MAX_DEVICES - 1); + return nullptr; + } + + static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL,// defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ NULL, + }; + + return &ggml_backend_buffer_type_qnn; +} + +static ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; + return ggml_backend_qnn_buffer_type(ctx->device); +} + +static ggml_backend_buffer_t 
ggml_backend_qnn_device_buffer_from_host_ptr(ggml_backend_dev_t dev, + void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); +} + + +static bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; + return (ggml_qnn_can_handle_op(op, true)); +} + +static bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(dev); + return ggml_backend_buft_is_host(buft); +} + +static struct ggml_backend_device_i ggml_backend_qnn_device_interface = { + /* .get_name = */ ggml_backend_qnn_device_get_name, + /* .get_description = */ ggml_backend_qnn_device_get_description, + /* .get_memory = */ ggml_backend_qnn_device_get_memory, + /* .get_type = */ ggml_backend_qnn_device_get_type, + /* .get_props = */ ggml_backend_qnn_device_get_props, + /* .init_backend = */ ggml_backend_qnn_device_init_backend, + /* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_host_ptr, + /* .supports_op = */ ggml_backend_qnn_device_supports_op, + /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static ggml_backend_i ggml_backend_qnn_interface = { + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_update = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, +}; + +//FIXME: this guid is not make sense +static ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = { + 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 + }; + return &guid; +} + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + +int ggml_backend_qnn_get_device_count() { + return GGML_QNN_MAX_DEVICES; +} + +struct ggml_backend_qnn_reg_context { + std::vector devices; +}; + +static const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { + return "ggml-qnn"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_QNN_MAX_DEVICES; +} + +static ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_UNUSED(reg); + GGML_UNUSED(index); + + GGMLQNN_LOG_DEBUG("index %d", index); + ggml_backend_qnn_reg_context * ctx = (ggml_backend_qnn_reg_context *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; +} + +static void * ggml_backend_qnn_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { + 
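+    // resolve optional backend entry points by name; only the n_threads setter is exported by this backend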
GGML_UNUSED(reg); + + if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml_backend_qnn_set_n_threads; + } + return NULL; +} + +static const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { + /* .get_name = */ ggml_backend_qnn_reg_get_name, + /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count, + /* .get_device = */ ggml_backend_qnn_reg_get_device, + /* .get_proc_address = */ ggml_backend_qnn_reg_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_qnn_reg() { + static ggml_backend_reg reg; + static bool initialized = false; + GGMLQNN_LOG_DEBUG("enter ggml_backend_qnn_reg"); + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + ggml_backend_qnn_reg_context * ctx = new ggml_backend_qnn_reg_context; + + for (int i = 0; i < ggml_backend_qnn_get_device_count(); i++) { + ggml_backend_dev_t dev = new ggml_backend_device { + /* .iface = */ ggml_backend_qnn_device_interface, + /* .reg = */ ®, + /* .context = */ &g_qnn_mgr[i] + }; + ctx->devices.push_back(dev); + } + + reg = ggml_backend_reg { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_qnn_reg_interface, + /* .context = */ ctx + }; + } + + initialized = true; + } + GGMLQNN_LOG_DEBUG("leave ggml_backend_qnn_reg"); + + return ® +} + +/** + * + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU + * @param qnn_lib_path QNN binrary runtime library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @return + */ +ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { + int result = 0; + + if (nullptr == qnn_lib_path) + return nullptr; + + GGMLQNN_LOG_DEBUG("device %d", device); + GGMLQNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + if (device >= GGML_QNN_MAX_DEVICES) { + GGMLQNN_LOG_ERROR("invalid device %d", device); + return nullptr; + } + + if (nullptr != g_qnn_mgr[device].backend) { + GGMLQNN_LOG_WARN("qnn backend %d(%s) already loaded", device, ggml_backend_qnn_get_devname(device)); + return g_qnn_mgr[device].backend; + } + + std::string path = qnn_lib_path; + if (QNN_BACKEND_NPU == device) { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); + } else { + GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); + } else { + GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLQNN_LOG_INFO("%s backend setenv successfully\n", ggml_backend_qnn_get_devname(device)); + } else { + GGMLQNN_LOG_ERROR("%s backend setenv failure\n", ggml_backend_qnn_get_devname(device)); + } + } + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + GGMLQNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", ggml_backend_qnn_get_devname(device)); + delete instance; + return nullptr; + } + qnn_interface qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("qnn subsystem failure\n"); 
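    // the QNN interface could not be resolved from the loaded backend library: release the instance and bail out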
+ delete instance; + return nullptr; + } + + std::string device_name = ggml_backend_qnn_get_devname(device); + GGMLQNN_LOG_INFO("qnn device name %s", device_name.c_str()); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_qnn_reg(), device), + /* .context = */ &g_qnn_mgr[device] + }; + g_qnn_mgr[device].backend = qnn_backend; + + return qnn_backend; +} + +GGML_BACKEND_DL_IMPL(ggml_backend_qnn_reg) diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh new file mode 100755 index 0000000000000..412ccadadaf6b --- /dev/null +++ b/scripts/build-run-android.sh @@ -0,0 +1,202 @@ +#!/bin/bash + +set -e + +PWD=`pwd` +ANDROID_PLATFORM=android-34 +ANDROID_NDK=${PWD}/android-ndk-r26c +REMOTE_PATH=/data/local/tmp/ +GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf + +#QNN SDK could be found at: +#https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.31.0.250130/ + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from ${QNN_SDK_URL}...\n" + exit 1 + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! -f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + unzip android-ndk-r26c-linux.zip + + if [ $? -ne 0 ]; then + printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" + exit 1 + fi + + printf "android ndk saved to ${ANDROID_NDK} \n\n" + else + printf "android ndk already exist:${ANDROID_NDK} \n\n" + fi +} + + +function build_arm64 +{ + cmake -H. -B./out/android -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + cd out/android + make -j16 + show_pwd + + cd - +} + + +function remove_temp_dir() +{ + if [ -d out ]; then + echo "remove out directory in `pwd`" + rm -rf out + fi +} + + +function check_qnn_libs() +{ + #reuse the cached qnn libs on Android phone + adb shell ls ${REMOTE_PATH}/libQnnCpu.so + if [ $? 
-eq 0 ]; then + printf "QNN libs already exist on Android phone\n" + else + update_qnn_libs + fi +} + + +function update_qnn_libs() +{ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ +} + + +function build_ggml_qnn() +{ + show_pwd + check_and_download_ndk + check_qnn_sdk + dump_vars + remove_temp_dir + build_arm64 +} + + +function run_llamacli() +{ + check_qnn_libs + + if [ -f ./out/android/bin/libggml-qnn.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/llama-cli ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/llama-cli + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/llama-cli -mg 2 -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + +} + +function run_test-backend-ops() +{ + check_qnn_libs + + if [ -f ./out/android/bin/libggml-qnn.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/test-backend-ops ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/test-backend-ops + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test" + +} + + +function show_usage() +{ + echo "Usage:" + echo " $0 build" + echo " $0 updateqnnlib" + echo " $0 run_llamacli" + echo " $0 run_testop" + echo -e "\n\n\n" +} + + +show_pwd + +check_qnn_sdk + +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + show_usage + exit 1 + elif [ "$1" == "help" ]; then + show_usage + exit 1 + elif [ "$1" == "build" ]; then + build_ggml_qnn + exit 0 + elif [ "$1" == "run_llamacli" ]; then + run_llamacli + exit 0 + elif [ "$1" == "run_testop" ]; then + run_test-backend-ops + exit 0 + elif [ "$1" == "updateqnnlib" ]; then + update_qnn_libs + exit 0 + fi +else + show_usage + exit 1 +fi From f4758385facb12199fe67eaa9f7bfa54b63e2dbc Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 15 Feb 2025 11:14:13 +0800 Subject: [PATCH 063/200] ggml-qnn: santiy check --- ggml/src/ggml-qnn/ggml-qnn.cpp | 67 ++++++++++++++++------------------ 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index d29c6cb6f9222..780bc3553ab0f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1,9 +1,6 @@ /* * Copyright (c) 2023-2024 The ggml authors * - * this is implementation of ggml-qnn(ggml-qnn backend of Qualcomm QNN(Qualcomm Neural Network, - * aka Qualcomm AI Engine Direct) - * * Qualcomm QNN SDK and reference tech guides could be found at: * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools @@ -17,7 +14,7 @@ * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem * * currently only provide GGML_OP_ADD's QNN backend implementation: - * - GGML_OP_ADD: 
this is skeleton, can expand other ggml ops as expertise + * - GGML_OP_ADD: this is skeleton, can expand other ggml ops according to expertise * * of course, can porting ggml-qnn to Windows on ARM as need. * @@ -105,10 +102,6 @@ class qnn_instance; struct ggml_backend_qnn_context; static int free_qnn_tensor(Qnn_Tensor_t * tensor); static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); - -#if (defined __ANDROID__) || (defined ANDROID) -extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...) __attribute__((__format__(printf, 3, 4))); -#endif static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); // ================================================================================================= @@ -142,13 +135,13 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const int len = vsnprintf(s_ggmlqnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { #if (defined __ANDROID__) || (defined ANDROID) - //for Android APK + //for Android application(standard APP or command line tool) __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); #endif #if (defined __ANDROID__) || (defined ANDROID) - //do nothing when running on Android phone + //do nothing when running on Snapdragon based Android device #else - //for Windows on ARM + //for Snapdragon based WoA(Windows on ARM) device printf("%s\n", s_ggmlqnn_log_internal_buf); #endif } @@ -851,7 +844,6 @@ static int free_qnn_tensor(Qnn_Tensor_t * tensor) { free(src_qparam.bwAxisScaleOffsetEncoding.offsets); } } - //GGMLQNN_LOG_DEBUG("tensor dims %p", QNN_TENSOR_GET_DIMENSIONS(*tensor)); free(QNN_TENSOR_GET_DIMENSIONS(*tensor)); free(tensor); @@ -1367,8 +1359,8 @@ static struct qcom_socinfo * qnn_get_socinfo_from_socmodel(uint32_t soc_model) { return nullptr; } -static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { +static bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { GGMLQNN_LOG_WARN("invalid params\n"); return false; @@ -1383,9 +1375,9 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso return true; } -#define CHECK_PARAMS(ctx, src0, src1, dst) \ +#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ - if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ return; \ } \ } while (0) @@ -1516,7 +1508,7 @@ static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { return GGML_TYPE_COUNT; } -//TODO: +//TODO: add more ops static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { switch (ggmlop) { case GGML_OP_ADD: @@ -1540,7 +1532,7 @@ static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & o int len = 0; switch (ggml_n_dims(tensor)) { case 1: - len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name); + len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name); break; case 2: len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); @@ -1913,7 +1905,7 @@ class 
qnn_instance { void unregister_rpcmem(); void unregister_rpcmem(Qnn_MemHandle_t mem_handle); - void *alloc_rpcmem(size_t bytes, size_t alignment); + void * alloc_rpcmem(size_t bytes, size_t alignment); void free_rpcmem(void * buf); @@ -2252,7 +2244,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; -#if 0 //not used in PR, keep them here for further use +#if 0 // keep them here for further use QnnSaver_Config_t outputdir_cfg; outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; outputdir_cfg.outputDirectory = "/data/local/tmp/"; @@ -2307,8 +2299,8 @@ int qnn_instance::load_system() { _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _system_lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); - //re-try with Android APK's internal QNN runtime lib path - _lib_path = "/data/data/com.cdeos.kantv/qnnlib/"; + //re-try with default path of QNN binary runtime lib + _lib_path = "/data/local/tmp/"; system_lib_path = _lib_path + "libQnnSystem.so"; _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _system_lib_handle) { @@ -2604,7 +2596,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - //TODO: faster approach to probe the accurate capacity of QNN RPC ion memory size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; const int SIZE_IN_MB = (1 << 20); @@ -2648,7 +2639,7 @@ int qnn_instance::qnn_finalize() { //FIXME:should be removed in the future reset_idx(); - if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's mobile SoC equipped low-end phone happy + if (nullptr != _pfn_rpc_mem_deinit) _pfn_rpc_mem_deinit(); if (dlclose(_rpc_lib_handle) != 0) { @@ -2922,8 +2913,8 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum } #if 0 //TODO: offload mul_mat to QNN backend - //we need to process type traint in func ggml_qnn_mul_mat(...) with following case: - //src0: q4_0, q6_k + //need to process type trait in func ggml_qnn_mul_mat(...): + //src0: q4_0, q6_k, ... //src1: f32 //dst : f32 return (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) @@ -2959,13 +2950,15 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; op_perf.start(); std::string map_entry; @@ -3174,17 +3167,17 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { #endif } -//TODO: type trait with op->src[0] +//TODO: /* - * the procedure of ggml_qnn_mul_mat is similar to ggml_qnn_add,but there are type trait process - * for ggml_qnn_mul_mat, so it's a standalone function. + * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add,but type trait and matrix transpose are required + * for offload mulmat to QNN backend, so it's a standalone function. 
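 * (note: the quantized mul_mat_q_f32 case is currently disabled in ggml_qnn_can_handle_op, so those tensors fall back to the ggml CPU backend)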
* * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, we should focus on MUL_MAT. * * we have three kinds of MUL_MAT to compute: * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, quantize in src0 -> f32 in src0', then src0' * src1 + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1 */ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -3205,13 +3198,15 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; op_perf.start(); std::string map_entry; From edff40a72d8839f2c4706d4dacf747a3fad801ce Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 16 Feb 2025 21:35:24 +0800 Subject: [PATCH 064/200] ggml-qnn: update script build-run-android.sh to compare peformance of ggml-qnn --- scripts/build-run-android.sh | 48 ++++++++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 412ccadadaf6b..63614e6afe110 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -7,6 +7,7 @@ ANDROID_PLATFORM=android-34 ANDROID_NDK=${PWD}/android-ndk-r26c REMOTE_PATH=/data/local/tmp/ GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf +GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf #QNN SDK could be found at: #https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk @@ -14,6 +15,9 @@ GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.31.0.250130/ +#default is QNN NPU +qnnbackend=2 + function dump_vars() { echo -e "ANDROID_NDK: ${ANDROID_NDK}" @@ -137,10 +141,28 @@ function run_llamacli() adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-cli -mg 2 -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" } + +function run_llamabench() +{ + check_qnn_libs + + if [ -f ./out/android/bin/libggml-qnn.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/llama-bench ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/llama-bench + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}" + +} + + function run_test-backend-ops() { check_qnn_libs @@ -163,8 +185,9 @@ function show_usage() echo "Usage:" echo " $0 build" echo " $0 updateqnnlib" - echo " $0 run_llamacli" echo " $0 run_testop" + echo " $0 
run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" echo -e "\n\n\n" } @@ -186,15 +209,30 @@ elif [ $# == 1 ]; then elif [ "$1" == "build" ]; then build_ggml_qnn exit 0 - elif [ "$1" == "run_llamacli" ]; then - run_llamacli - exit 0 + elif [ "$1" == "run_testop" ]; then run_test-backend-ops exit 0 elif [ "$1" == "updateqnnlib" ]; then update_qnn_libs exit 0 + else + show_usage + exit 1 + fi +elif [ $# == 2 ]; then + qnnbackend=$2 + if [ ${qnnbackend} -gt 3 ]; then + show_usage + exit 1 + fi + + if [ "$1" == "run_llamacli" ]; then + run_llamacli + exit 0 + elif [ "$1" == "run_llamabench" ]; then + run_llamabench + exit 0 fi else show_usage From 12bc7ed242ac982c1b65b511e3bdac118d2789f9 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 17 Feb 2025 19:01:06 +0800 Subject: [PATCH 065/200] ggml-qnn: fix minor issue in test-backend-ops.cpp --- tests/test-backend-ops.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index d70acb7719435..9aa1783202d67 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4683,7 +4683,11 @@ int main(int argc, char ** argv) { continue; } +#ifdef GGML_USE_QNN + ggml_backend_t backend = ggml_backend_dev_init(dev, reinterpret_cast(i)); +#else ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); +#endif GGML_ASSERT(backend != NULL); ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); From df5300509b5ba5b6164e4b871d6367fa34d4e0a5 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 18 Feb 2025 09:53:57 +0800 Subject: [PATCH 066/200] ggml-qnn: merge QNN RPC feature from https://github.com/zhouwg/kantv/blob/ggml-qnn-quantize/core/ggml/llamacpp/ggml-qnn.cpp --- ggml/src/ggml-qnn/ggml-qnn.cpp | 534 +++++++++++++++++++-------------- 1 file changed, 307 insertions(+), 227 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 780bc3553ab0f..810711e41acc7 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -266,6 +266,13 @@ static void * ggmlqnn_host_malloc(size_t n) { } \ } while (0) +#define CHECK_QNN_API(error) \ + do { \ + if (QNN_SUCCESS != (error)) { \ + GGMLQNN_LOG_INFO("error = %d\n", (error)); \ + } \ + } while (0) + #define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) #define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) @@ -1175,40 +1182,20 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, }; -using ggml_dimension_array_t = int64_t[GGML_MAX_DIMS]; -using qnn_dimension_array_t = std::array; -using op_dims_calc_func_t = void (*)(const std::vector & input_dims, - ggml_dimension_array_t & output_dims); - -static void element_wise_op_dims(const std::vector & input_dims, - ggml_dimension_array_t &output_dims) { - for (size_t i = 1; i < std::size(output_dims); i++) { - output_dims[i] = input_dims.front()[i]; - } -} - -static void mat_mul_op_dims(const std::vector & input_dims, - ggml_dimension_array_t & output_dims) { - GGML_ASSERT(input_dims.size() == 2); - output_dims[0] = input_dims.front()[1]; - output_dims[1] = input_dims.back()[1]; -} struct qnn_op_caps_t { const char * qnn_op_name = nullptr; const size_t input_param_count = 0; - op_dims_calc_func_t calc_dims_func = nullptr; const char * qnn_param_name = nullptr; }; -constexpr static const qnn_op_caps_t kOpCaps[] = { +static const qnn_op_caps_t 
kOpCaps[] = { {}, // GGML_OP_NONE {}, // GGML_OP_DUP { // GGML_OP_ADD QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name - 2, // input_param_count - element_wise_op_dims, // calc_dims_func + 2, // input_param_count }, {}, // GGML_OP_ADD1 {}, // GGML_OP_ACC @@ -1237,7 +1224,6 @@ constexpr static const qnn_op_caps_t kOpCaps[] = { // GGML_OP_MUL_MAT QNN_OP_MAT_MUL, // qnn_op_name 2, // input_param_count - mat_mul_op_dims, // calc_dims_func }, {}, // GGML_OP_MUL_MAT_ID {}, // GGML_OP_OUT_PROD @@ -1885,7 +1871,7 @@ class qnn_instance { return 0; } - std::string &get_qnn_graph_name() { return _graph_name; } + std::string & get_qnn_graph_name() { return _graph_name; } bool is_rpcmem_initialized() { return _rpcmem_initialized; @@ -1906,8 +1892,10 @@ class qnn_instance { void unregister_rpcmem(Qnn_MemHandle_t mem_handle); void * alloc_rpcmem(size_t bytes, size_t alignment); + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); void free_rpcmem(void * buf); + void free_rpcmem(); bool is_rpcmem_allocated(void * buf); @@ -1915,6 +1903,10 @@ class qnn_instance { return _qnn_mem_set.count(handle) != 0U; } + bool enalbe_qnn_rpc() { + return _enable_qnn_rpc; + } + public: std::map> _qnn_graph_map; @@ -1975,15 +1967,16 @@ class qnn_instance { QNN_INTERFACE_VER_TYPE _qnn_raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - std::unordered_set _qnn_mem_set; + std::unordered_map _qnn_mem_set; std::unordered_map _qnn_rpc_buffer_to_handles; + static std::mutex _init_mutex; static std::unordered_map _loaded_lib_handle; static std::unordered_map _lib_path_to_backend_id; static std::unordered_map _loaded_backend; - void *_rpc_lib_handle = nullptr; + void * _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; pfn_rpc_mem_free _pfn_rpc_mem_free; @@ -1995,6 +1988,7 @@ class qnn_instance { std::string _graph_name; QNNBackend _device_id; + bool _enable_qnn_rpc = false; //FIXME:unknown issue with QNN RPC feature }; std::mutex qnn_instance::_init_mutex; @@ -2032,11 +2026,30 @@ void qnn_instance::free_rpcmem(void * buf) { } else if (0 == _rpcmem_store_map.count(buf)) { GGMLQNN_LOG_WARN("no allocated tensor\n"); } else { + GGMLQNN_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]); _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } } +void qnn_instance::free_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_rpcmem_store_map.empty()) { + GGMLQNN_LOG_WARN("no rpcmem allocated\n"); + return; + } + + for (std::unordered_map::iterator it = _rpcmem_store_map.begin(); + it != _qnn_mem_set.end(); + it++) { + void * rpcbuffer = it->second; + GGMLQNN_LOG_DEBUG("free rpc buffer %p", rpcbuffer); + _pfn_rpc_mem_free(rpcbuffer); + } + _rpcmem_store_map.clear(); +} + int32_t qnn_instance::rpcmem_to_fd(void * buf) { int32_t mem_fd = -1; if (!is_rpcmem_initialized()) { @@ -2059,10 +2072,6 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { return 2; } - if (is_rpcmem_allocated(p_data)) { - GGMLQNN_LOG_WARN("rpc memory already allocated\n"); - //return 3; - } if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { GGMLQNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); return 4; @@ -2094,7 +2103,7 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { GGMLQNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); } QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert(handle); + 
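    // keep the host pointer -> QNN mem handle mapping so the shared buffer can be looked up and unregistered later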
_qnn_mem_set.insert((std::pair(p_data, handle))); return 0; } @@ -2136,6 +2145,19 @@ Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t ran return handle; } +void * qnn_instance::get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); + it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + if (it->second == mem_handle) { + return it->first; + } + } + GGMLQNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; +} + void qnn_instance::unregister_rpcmem() { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -2143,10 +2165,16 @@ void qnn_instance::unregister_rpcmem() { GGMLQNN_LOG_WARN("no rpcmem registered\n"); } - for (auto &mem_handle : _qnn_mem_set) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); + it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); + GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", + QNN_GET_ERROR_CODE(error)); + } else { + GGMLQNN_LOG_DEBUG("unregister shared memory ok"); } } _qnn_mem_set.clear(); @@ -2158,14 +2186,14 @@ void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) { GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); } - auto it = std::find_if(_qnn_rpc_buffer_to_handles.begin(), _qnn_rpc_buffer_to_handles.end(), + auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), [mem_handle](const auto &kv) { return kv.second == mem_handle; }); - if (it == _qnn_rpc_buffer_to_handles.end()) { + if (it == _qnn_mem_set.end()) { GGMLQNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); return; } - _qnn_rpc_buffer_to_handles.erase(it); + _qnn_mem_set.erase(it); } bool qnn_instance::is_rpcmem_allocated(void * buf) { @@ -2562,7 +2590,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { temp_context_config.empty() ? 
nullptr : temp_context_config.data(), &_qnn_context_handle); if (nullptr == _qnn_context_handle) { - GGMLQNN_LOG_WARN("why failed to initialize qnn context\n"); + GGMLQNN_LOG_WARN("why failed to initialize qnn context, error:%s\n", strerror(errno)); return 8; } else { GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n"); @@ -2636,9 +2664,13 @@ int qnn_instance::qnn_finalize() { int ret_status = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; + GGMLQNN_LOG_DEBUG("enter %s\n", __func__); //FIXME:should be removed in the future reset_idx(); + free_rpcmem(); + unregister_rpcmem(); + if (nullptr != _pfn_rpc_mem_deinit) _pfn_rpc_mem_deinit(); @@ -2700,6 +2732,7 @@ int qnn_instance::qnn_finalize() { unload_backend(); unload_system(); + GGMLQNN_LOG_DEBUG("leave %s\n", __func__); return ret_status; } @@ -2812,10 +2845,133 @@ int qnn_instance::finalize_qnn_graph() { return 0; } +static uint8_t * create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { + if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { + GGMLQNN_LOG_WARN("invalid params\n"); + return nullptr; + } + + uint8_t * qnn_rpcbuffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(ggml_tensor), 4)); + if (nullptr == qnn_rpcbuffer) { + GGMLQNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + return nullptr; + } else { + GGMLQNN_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer); + } + if (b_copydata) + memcpy(qnn_rpcbuffer, ggml_tensor->data, ggml_nbytes(ggml_tensor)); + instance->register_rpcmem(qnn_rpcbuffer, qnn_tensor); + return qnn_rpcbuffer; +} + +static Qnn_ErrorHandle_t create_htp_graph(ggml_backend_qnn_context * ctx, const std::string & graph_name, Qnn_GraphHandle_t * graph_handle) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + if (nullptr == ctx) + return QNN_MIN_ERROR_COMMON; + + qnn_instance * instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 4; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 3; // 1 or 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + QnnHtpGraph_CustomConfig_t precision_config; + precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + precision_config.precision = QNN_PRECISION_FLOAT16; + QnnGraph_Config_t 
graph_precision_config; + graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_precision_config.customConfig = &precision_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + &graph_precision_config, + NULL}; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), + p_graphconfig, graph_handle); + return error; +} + +static void print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + //skip sanity check of params + GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + GGMLQNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + GGMLQNN_LOG_DEBUG("tensor0 name %s", src0->name); + GGMLQNN_LOG_DEBUG("tensor1 name %s", src1->name); + GGMLQNN_LOG_DEBUG("tensor2 name %s", dst->name); +} + +static void dump_tensors_info(const struct ggml_tensor * tensor) { + //skip sanity check of params + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; + GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), + ggml_type_name(tensor->type)); + GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); + GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); + GGMLQNN_LOG_DEBUG( + "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + GGMLQNN_LOG_DEBUG( + "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + GGMLQNN_LOG_DEBUG( + " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, + tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], + tensor->ne[2], + tensor->nb[0], + tensor->nb[1], tensor->nb[2]); +} + // ================================================================================================= // section-6: implementation of ggml-qnn backend // ================================================================================================= -static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dump_tensor_info) { +static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { if (tensor->op == GGML_OP_NONE) { return true; } @@ -2846,32 +3002,12 @@ static bool ggml_qnn_can_handle_op(const struct 
ggml_tensor * tensor, bool b_dum if (!ggml_are_same_shape(src0, src1)) { return false; } -#if GGMLQNN_PRINT_OP_ADD_LOG - if (b_dump_tensor_info) { - GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - GGMLQNN_LOG_DEBUG("GGML_OP_ADD"); - GGMLQNN_LOG_DEBUG( - "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - GGMLQNN_LOG_DEBUG( - "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - GGMLQNN_LOG_DEBUG( - " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, - tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], - tensor->ne[2], - tensor->nb[0], - tensor->nb[1], tensor->nb[2]); - } + if (ne00 < 32) + return false; + +#if GGMLQNN_PRINT_OP_ADD_LOG + dump_tensors_info(tensor); #endif return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); @@ -2880,31 +3016,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum if (tensor->op == GGML_OP_MUL_MAT) { #if GGMLQNN_PRINT_OP_MUL_MAT_LOG - if (b_dump_tensor_info) { - GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - GGMLQNN_LOG_DEBUG("dst type:%s", ggml_type_name(tensor->type)); - GGMLQNN_LOG_DEBUG( - "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - GGMLQNN_LOG_DEBUG( - "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - GGMLQNN_LOG_DEBUG( - "dst %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, - tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], - tensor->ne[2], - tensor->nb[0], - tensor->nb[1], tensor->nb[2]); - - } + dump_tensors_info(tensor); #endif //FIXME: 2048 is an experimental value between ASR inference and LLM inference because // it's better only offload big matrix to QNN backend @@ -2920,7 +3032,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum return (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16); #else - //passthrough mul_mat + //fall back to ggml cpu backend return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && (src0->type == src1->type) && (src0->type == tensor->type); @@ -2954,6 +3066,10 @@ static void 
ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { const ggml_tensor * src1 = op->src[1]; ggml_tensor * dst = op; + uint8_t * qnn_rpcbuffer_0 = nullptr; + uint8_t * qnn_rpcbuffer_1 = nullptr; + uint8_t * qnn_rpcbuffer_2 = nullptr; + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; @@ -2976,26 +3092,7 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { tensor_2 = ggml_qnn_create_tensor(dst); } -//#if GGMLQNN_DEBUG //uncomment this line and comment next line when troubleshooting mul_mat issue -#if GGMLQNN_PRINT_OP_ADD_LOG - GGMLQNN_LOG_DEBUG("call %s in dev %s\n", __func__, ctx->name); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - GGMLQNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - GGMLQNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - GGMLQNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - GGMLQNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif + print_tensors_info(__func__, ctx, src0, src1, dst); QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -3013,51 +3110,7 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { graph_name = map_entry; GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 4; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 3; // 1 or 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - QnnHtpGraph_CustomConfig_t precision_config; - precision_config.option = 
QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; - precision_config.precision = QNN_PRECISION_FLOAT16; - QnnGraph_Config_t graph_precision_config; - graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_precision_config.customConfig = &precision_config; - - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - &graph_precision_config, - NULL}; - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), - p_graphconfig, &graph_handle); + error = create_htp_graph(ctx, graph_name, &graph_handle); } else { error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), @@ -3067,23 +3120,45 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); + + if (instance->enalbe_qnn_rpc()) { + if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf = {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf = {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_2)->clientBuf = {.data=nullptr, .dataSize=0}; + } } + + error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); + CHECK_QNN_API(error); error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); + CHECK_QNN_API(error); + + if (instance->enalbe_qnn_rpc()) { + if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend + qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, tensor_0, true); + qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, tensor_1, true); + qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, tensor_2, false); + if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || + nullptr == qnn_rpcbuffer_2) { + GGMLQNN_LOG_INFO("create rpc buffer failure\n"); + //FIXME: potential memory leak althought it shouldn't happen + return; + } + } + } else { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; } - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - Qnn_Tensor_t tensor_inputs[] = { *tensor_0, *tensor_1 @@ -3105,42 +3180,69 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, 
nullptr); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); + CHECK_QNN_API(error); + + if (instance->enalbe_qnn_rpc()) { + if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend + uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); + if (nullptr != qnn_rpcbuffer) { + memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); + } + } } + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; + } else { + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + if (instance->enalbe_qnn_rpc()) { + if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend + //FIXME:why failure with test-backend-ops + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_rpcbuffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } + + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_1)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_rpcbuffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + } + } else { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + } Qnn_Tensor_t tensor_inputs[] = { *tensor_0, @@ 
-3156,6 +3258,15 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { if (QNN_SUCCESS != error) { GGMLQNN_LOG_INFO("error = %d\n", error); } + + if (instance->enalbe_qnn_rpc()) { + if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend + //FIXME:why failure with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + } } //avoid memory leak in func free_qnn_tensor @@ -3224,25 +3335,8 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { tensor_2 = ggml_qnn_create_tensor(dst); } -#if GGMLQNN_DEBUG - GGMLQNN_LOG_DEBUG("call %s in dev %s\n", __func__, ctx->name); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - GGMLQNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - GGMLQNN_LOG_DEBUG("tensor0 name %s", QNN_TENSOR_GET_NAME(tensor_0)); - GGMLQNN_LOG_DEBUG("tensor1 name %s", QNN_TENSOR_GET_NAME(tensor_1)); - GGMLQNN_LOG_DEBUG("tensor2 name %s", QNN_TENSOR_GET_NAME(tensor_2)); -#endif + print_tensors_info(__func__, ctx, src0, src1, dst); + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; @@ -3265,17 +3359,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { return; } error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; @@ -3302,20 +3390,14 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { } }; error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + 
CHECK_QNN_API(error); auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; } else { @@ -3350,9 +3432,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); } //avoid memory leak in func free_qnn_tensor @@ -3699,7 +3779,7 @@ static ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_host_ptr(ggml_b static bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; - return (ggml_qnn_can_handle_op(op, true)); + return (ggml_qnn_can_handle_op(op)); } static bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { From b733ea7ab67f397a286d4add01a40612dae98a23 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 18 Feb 2025 17:35:04 +0800 Subject: [PATCH 067/200] ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc --- ggml/src/ggml-qnn/ggml-qnn.cpp | 133 +++++++++++++++------------------ 1 file changed, 60 insertions(+), 73 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 810711e41acc7..6f2949333908e 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1903,7 +1903,7 @@ class qnn_instance { return _qnn_mem_set.count(handle) != 0U; } - bool enalbe_qnn_rpc() { + bool enable_qnn_rpc() { return _enable_qnn_rpc; } @@ -1989,6 +1989,9 @@ class qnn_instance { std::string _graph_name; QNNBackend _device_id; bool _enable_qnn_rpc = false; //FIXME:unknown issue with QNN RPC feature + + DISABLE_COPY(qnn_instance); + DISABLE_MOVE(qnn_instance); }; std::mutex qnn_instance::_init_mutex; @@ -3106,6 +3109,8 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; + if (!graph_initialized) { graph_name = map_entry; GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); @@ -3121,37 +3126,29 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { return; } - if (instance->enalbe_qnn_rpc()) { - if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend - QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_0)->clientBuf = {.data=nullptr, .dataSize=0}; + if (enable_npu_rpc) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf = {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_1)->clientBuf = {.data=nullptr, .dataSize=0}; + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf = {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_2)->clientBuf = {.data=nullptr, .dataSize=0}; - } + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_2)->clientBuf = {.data=nullptr, .dataSize=0}; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - CHECK_QNN_API(error); - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - 
CHECK_QNN_API(error); - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - CHECK_QNN_API(error); - - if (instance->enalbe_qnn_rpc()) { - if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend - qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, tensor_0, true); - qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, tensor_1, true); - qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, tensor_2, false); - if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || - nullptr == qnn_rpcbuffer_2) { - GGMLQNN_LOG_INFO("create rpc buffer failure\n"); - //FIXME: potential memory leak althought it shouldn't happen - return; - } + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0)); + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1)); + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2)); + + if (enable_npu_rpc) { + qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, tensor_0, true); + qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, tensor_1, true); + qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, tensor_2, false); + if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { + GGMLQNN_LOG_INFO("create rpc buffer failure\n"); + //FIXME: potential memory leak althought it shouldn't happen + return; } } else { QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; @@ -3179,23 +3176,19 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { tensor_outputs } }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - CHECK_QNN_API(error); - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); - CHECK_QNN_API(error); + CHECK_QNN_API(error = qnn_raw_interface.graphAddNode(graph_handle, op_config)); + CHECK_QNN_API(error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); CHECK_QNN_API(error); - if (instance->enalbe_qnn_rpc()) { - if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend - uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); - if (nullptr != qnn_rpcbuffer) { - memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); - } + if (enable_npu_rpc) { + uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); + if (nullptr != qnn_rpcbuffer) { + memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); } } @@ -3223,25 +3216,23 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - if (instance->enalbe_qnn_rpc()) { - if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend - //FIXME:why failure with test-backend-ops - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_rpcbuffer_0); - if (nullptr != qnn_buffer_0) { - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - } + if (enable_npu_rpc) { + //FIXME:why failure with test-backend-ops + uint8_t * qnn_buffer_0 = 
static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_rpcbuffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_rpcbuffer_1); - if (nullptr != qnn_buffer_1) { - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_1)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_rpcbuffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); } } else { QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; } Qnn_Tensor_t tensor_inputs[] = { @@ -3255,16 +3246,13 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { tensor_inputs, 2, tensor_outputs, 1, nullptr, nullptr); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("error = %d\n", error); - } + CHECK_QNN_API(error); - if (instance->enalbe_qnn_rpc()) { - if (ctx->device == QNN_BACKEND_NPU) { // QNN RPC feature only available for NPU backend - //FIXME:why failure with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); - if (nullptr != qnn_buffer_2) - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + if (enable_npu_rpc) { + //FIXME:why failure with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); } } } @@ -3358,12 +3346,9 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0); - CHECK_QNN_API(error); - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1); - CHECK_QNN_API(error); - error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2); - CHECK_QNN_API(error); + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0)); + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1)); + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2)); QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; @@ -3389,10 +3374,8 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { tensor_outputs } }; - error = qnn_raw_interface.graphAddNode(graph_handle, op_config); - CHECK_QNN_API(error); - error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr); - CHECK_QNN_API(error); + CHECK_QNN_API(error = qnn_raw_interface.graphAddNode(graph_handle, op_config)); + CHECK_QNN_API(error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); error = qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, 
tensor_outputs, 1, @@ -3400,7 +3383,9 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { CHECK_QNN_API(error); auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); instance->_qnn_graph_map[map_entry] = graph_item; + } else { + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], @@ -3410,9 +3395,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; @@ -3656,7 +3643,7 @@ static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, s ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; GGML_UNUSED(ctx); - //GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); + GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE From 92303e590511acfa54cc3216f79a8fe69c526aee Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 19 Feb 2025 21:58:55 +0800 Subject: [PATCH 068/200] ggml-qnn: a concise approach to offload mulmat to QNN backend(sync from branch kantvai-ggmlqnn-npurpc, https://github.com/kantv-ai/llama.cpp/wiki/offloading-mulmat-to-QNN-backend) --- ggml/src/ggml-qnn/ggml-qnn.cpp | 626 ++++++++++++++++++++------------- 1 file changed, 377 insertions(+), 249 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 6f2949333908e..a1aca7940bf4f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -13,8 +13,9 @@ * section-5 does ggml-qnn backend helper macro / data structure / function / class * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem * - * currently only provide GGML_OP_ADD's QNN backend implementation: - * - GGML_OP_ADD: this is skeleton, can expand other ggml ops according to expertise + * currently only provide OPs' QNN backend implementation of GGML_OP_ADD & GGML_OP_MUL_MAT: + * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise + * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex op accordingly * * of course, can porting ggml-qnn to Windows on ARM as need. 
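Alongside the header-comment update above, the next hunk in this patch reworks the CHECK_QNN_API helper so that it receives the QNN call itself as its second argument and performs the assignment internally. A minimal usage sketch, based on the call sites that appear later in this patch (helper names and arguments are taken verbatim from those call sites):

    // before (earlier patches in this series): assign, then check
    error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
    CHECK_QNN_API(error);

    // after (this patch): the call is passed as the second macro argument;
    // the macro assigns the result to `error` and logs
    // QNN_COMMON_ERROR_NOT_SUPPORTED as a warning rather than an error
    CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr));
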
* @@ -257,20 +258,25 @@ static void * ggmlqnn_host_malloc(size_t n) { // ================================================================================================= // section-4: QNN helper macro / data structure / function // ================================================================================================= -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - GGMLQNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + GGMLQNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ } while (0) -#define CHECK_QNN_API(error) \ - do { \ - if (QNN_SUCCESS != (error)) { \ - GGMLQNN_LOG_INFO("error = %d\n", (error)); \ - } \ +#define CHECK_QNN_API(error, result) \ + do { \ + error = (result); \ + if (QNN_SUCCESS != error) { \ + if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ + GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + } else { \ + GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, qnn_get_error_string(error)); \ + } \ + } \ } while (0) #define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) @@ -823,9 +829,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); + size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); uint32_t * dimensions = (uint32_t *)malloc(dim_size); - GGMLQNN_LOG_DEBUG("tensor dims %p", dimensions); if (dimensions == nullptr) { GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); return 1; @@ -1025,6 +1030,9 @@ using _pfn_QnnSaver_initialize = decltype(QnnSaver_init using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); +using qnn_res_t = std::tuple>; +using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; + enum class ggml_qnn_profile_level { profile_off = 0, profile_basic = 1, @@ -1122,12 +1130,9 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; struct qcom_socinfo socinfo; - - //FIXME: should I move it from public member of class qnn_instance to here? 
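A note on the qnn_res_t / qnn_tensors_t aliases introduced above: their template arguments (and those of _qnn_graph_map below) are elided in this rendering. Judging from how they are accessed later in the patch (std::get<0>() yields the graph handle, std::get<1>() the vector of tensor pointers), the declarations are presumably along these lines:

    // presumed full declarations, inferred from usage; the angle-bracket
    // contents are not visible in the hunks above
    using qnn_tensors_t = std::vector<Qnn_Tensor_t *>;
    using qnn_res_t     = std::tuple<Qnn_GraphHandle_t, qnn_tensors_t>;

    // per-instance cache, keyed by graph name
    std::map<std::string, qnn_res_t> _qnn_graph_map;
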
- //std::map> _qnn_graph_map; } ; -//FIXME: the following global vars and three helper funcs should be removed in the future +//TODO: the following global vars and three helper funcs should be removed in the future static int32_t g_ggmltensor_idx = 0; static void reset_idx() { g_ggmltensor_idx = 0; @@ -1399,11 +1404,11 @@ static const char * ggml_get_type_name(ggml_type type) { return traits->type_name; } -Qnn_Tensor_t * ggml_qnn_create_tensor(const ggml_tensor * tensor) { +static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) { Qnn_ErrorHandle_t error = QNN_SUCCESS; char tensor_name[GGML_MAX_NAME] = {0}; - //FIXME:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique + //TODO:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); inc_idx(); @@ -1450,6 +1455,73 @@ Qnn_Tensor_t * ggml_qnn_create_tensor(const ggml_tensor * tensor) { return p_qnn_tensor; } +static Qnn_Tensor_t * ggml_qnn_create_mulmat_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {0}; + + //TODO:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique + if (nullptr != name) { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); + } else { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, get_idx()); + } + GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); + inc_idx(); + + //there are different dimension order between ggml tensor and qnn tensor + uint32_t dimensions_transpose[GGML_MAX_DIMS] = {}; + uint32_t * tensor_dims = nullptr; + + if (nullptr != tensor) { + dimensions_transpose[0] = (uint32_t) tensor->ne[1]; + dimensions_transpose[1] = (uint32_t) tensor->ne[0]; + dimensions_transpose[2] = (uint32_t) tensor->ne[2]; + dimensions_transpose[3] = (uint32_t) tensor->ne[3]; + tensor_dims = dimensions_transpose; + } + if (nullptr != dims) { + tensor_dims = dims; + } + + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = rank, + .dimensions = tensor_dims, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {nullptr, 0} + } + } + } + }; + if (nullptr != name) { + QNN_VER_PTR(qnn_tensor)->name = name; + } + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + GGMLQNN_LOG_WARN("calloc failed"); + return nullptr; + } + error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + GGMLQNN_LOG_WARN("init tensor failed"); + return nullptr; + } + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; + + return p_qnn_tensor; +} + //TODO: // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { @@ -1908,7 +1980,7 @@ class qnn_instance { } public: - std::map> _qnn_graph_map; + std::map>> _qnn_graph_map; private: int load_system(); @@ -1988,7 
+2060,7 @@ class qnn_instance { std::string _graph_name; QNNBackend _device_id; - bool _enable_qnn_rpc = false; //FIXME:unknown issue with QNN RPC feature + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature DISABLE_COPY(qnn_instance); DISABLE_MOVE(qnn_instance); @@ -2207,7 +2279,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; @@ -2223,7 +2295,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * // get QnnInterface Providers std::uint32_t num_providers = 0; - const QnnInterface_t **provider_list = nullptr; + const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { GGMLQNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); @@ -2282,8 +2354,9 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * QnnSaver_Config_t backendid_cfg; backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; backendid_cfg.backendId = _backend_id; - const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; - if (0 == QnnSaver_initialize(saverCfg)) { + + const QnnSaver_Config_t * saver_cfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; + if (0 == QnnSaver_initialize(saver_cfg)) { GGMLQNN_LOG_INFO("QnnSaver_initialize successfully"); } else { GGMLQNN_LOG_WARN("QnnSaver_initialize failure"); @@ -2668,7 +2741,7 @@ int qnn_instance::qnn_finalize() { Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("enter %s\n", __func__); - //FIXME:should be removed in the future + //TODO:should be removed in the future reset_idx(); free_rpcmem(); @@ -2971,6 +3044,20 @@ static void dump_tensors_info(const struct ggml_tensor * tensor) { tensor->nb[1], tensor->nb[2]); } +//TODO: currently only support offloading 2D matrix to QNN backend +static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, uint32_t * ggml_dimensions, uint32_t rank) { + if (rank > GGML_MAX_DIMS) { + GGMLQNN_LOG_WARN("invalid params"); + return; + } + if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { + GGMLQNN_LOG_WARN("invalid params"); + return; + } + qnn_dimensions[0] = ggml_dimensions[1]; + qnn_dimensions[1] = ggml_dimensions[0]; +} + // ================================================================================================= // section-6: implementation of ggml-qnn backend // ================================================================================================= @@ -3010,7 +3097,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { return false; #if GGMLQNN_PRINT_OP_ADD_LOG - dump_tensors_info(tensor); + //dump_tensors_info(tensor); #endif return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); @@ -3019,27 +3106,21 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { if (tensor->op == GGML_OP_MUL_MAT) { #if GGMLQNN_PRINT_OP_MUL_MAT_LOG - dump_tensors_info(tensor); + //dump_tensors_info(tensor); #endif - //FIXME: 2048 is an experimental value between ASR inference and LLM inference 
because - // it's better only offload big matrix to QNN backend - if (ne01 <= 2048) { + uint32_t src0_rank = ggml_get_tensor_rank(src0); + uint32_t src1_rank = ggml_get_tensor_rank(src1); + + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mulmat to QNN backend return false; - } -#if 0 - //TODO: offload mul_mat to QNN backend - //need to process type trait in func ggml_qnn_mul_mat(...): + + //TODO: support more data type in func ggml_qnn_mul_mat(...): //src0: q4_0, q6_k, ... //src1: f32 //dst : f32 - return (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16); -#else - //fall back to ggml cpu backend return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && (src0->type == src1->type) && (src0->type == tensor->type); -#endif } //TODO:for other op @@ -3054,65 +3135,51 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { bool graph_initialized = false; qnn_instance * instance = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - std::string graph_name = "ggml_op_qnn_add"; qnn_perf op_perf = qnn_perf("ggml_qnn_add"); Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; const ggml_tensor * src0 = op->src[0]; const ggml_tensor * src1 = op->src[1]; ggml_tensor * dst = op; - uint8_t * qnn_rpcbuffer_0 = nullptr; - uint8_t * qnn_rpcbuffer_1 = nullptr; - uint8_t * qnn_rpcbuffer_2 = nullptr; - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); - std::string map_entry; - get_graph_key_from_op(op, map_entry); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + std::string graph_name; + get_graph_key_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + qnn_tensors_t & tensor = std::get<1>(graph_item); + p_tensor0 = tensor[0]; + p_tensor1 = tensor[1]; + p_tensor2 = tensor[2]; } else { - tensor_0 = ggml_qnn_create_tensor(src0); - tensor_1 = ggml_qnn_create_tensor(src1); - tensor_2 = ggml_qnn_create_tensor(dst); + p_tensor0 = ggml_qnn_create_compute_tensor(src0); + p_tensor1 = ggml_qnn_create_compute_tensor(src1); + p_tensor2 = ggml_qnn_create_compute_tensor(dst); } - print_tensors_info(__func__, ctx, src0, src1, dst); - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = 
qnn_datatype_from_ggml_datatype(dst->type); + //ensure QNN tensor has correct tensor type + QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + //save the original dimensions of qnn tensors + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; if (!graph_initialized) { - graph_name = map_entry; GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { error = create_htp_graph(ctx, graph_name, &graph_handle); @@ -3127,44 +3194,44 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { } if (enable_npu_rpc) { - QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_0)->clientBuf = {.data=nullptr, .dataSize=0}; + QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor0)->clientBuf = {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_1)->clientBuf = {.data=nullptr, .dataSize=0}; + QNN_VER_PTR(*p_tensor1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor1)->clientBuf = {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_2)->clientBuf = {.data=nullptr, .dataSize=0}; + QNN_VER_PTR(*p_tensor2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor2)->clientBuf = {.data=nullptr, .dataSize=0}; } - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0)); - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1)); - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); if (enable_npu_rpc) { - qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, tensor_0, true); - qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, tensor_1, true); - qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, tensor_2, false); + uint8_t * qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, p_tensor0, true); + uint8_t * qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, p_tensor1, true); + uint8_t * qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, p_tensor2, false); if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { GGMLQNN_LOG_INFO("create rpc buffer failure\n"); - //FIXME: potential memory leak althought it shouldn't happen + //TODO: potential memory leak although it shouldn't happen return; } } else { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + 
QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; } Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 + *p_tensor0, + *p_tensor1 }; Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 + *p_tensor2 }; Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { + QNN_OPCONFIG_VERSION_1, .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, @@ -3176,26 +3243,38 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { tensor_outputs } }; - CHECK_QNN_API(error = qnn_raw_interface.graphAddNode(graph_handle, op_config)); - CHECK_QNN_API(error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - error = qnn_raw_interface.graphExecute(graph_handle, + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, - nullptr, nullptr); - CHECK_QNN_API(error); + nullptr, nullptr)); if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); if (nullptr != qnn_rpcbuffer) { memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); } } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); - instance->_qnn_graph_map[map_entry] = graph_item; + qnn_tensors_t ggml_op_add_tensors; + ggml_op_add_tensors.reserve(3); + ggml_op_add_tensors.push_back(p_tensor0); + ggml_op_add_tensors.push_back(p_tensor1); + ggml_op_add_tensors.push_back(p_tensor2); + + auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; } else { + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -3204,76 +3283,76 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*p_tensor0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*p_tensor1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; - 
QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; + QNN_VER_PTR(*p_tensor2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type; if (enable_npu_rpc) { - //FIXME:why failure with test-backend-ops - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_rpcbuffer_0); + //TODO: NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); if (nullptr != qnn_buffer_0) { memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); } - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_rpcbuffer_1); + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); if (nullptr != qnn_buffer_1) { memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); } } else { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; } Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 + *p_tensor0, + *p_tensor1 }; Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 + *p_tensor2 }; - error = qnn_raw_interface.graphExecute(graph_handle, + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, - nullptr, nullptr); - CHECK_QNN_API(error); + nullptr, nullptr)); if (enable_npu_rpc) { - //FIXME:why failure with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + //TODO:NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); if (nullptr != qnn_buffer_2) { memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); } } } - //avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; #if GGMLQNN_PRINT_OP_ADD_LOG op_perf.info(); #endif } -//TODO: /* - * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add,but type trait and matrix transpose are required - * for offload mulmat to QNN backend, so it's a standalone function. 
+ * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add but much more complicated than ggml_qnn_add, + * matrix transpose and type trait are required for offload mulmat to QNN backend, + * so it's a standalone function. accordingly, this is another typical skeleton for offload other + * ggml ops to QNN backend * * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, we should focus on MUL_MAT. * - * we have three kinds of MUL_MAT to compute: + * have three kinds of MUL_MAT to compute: * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1 @@ -3284,148 +3363,200 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); qnn_instance * instance = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - - std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Tensor_t * p_param_tensor = nullptr; + Qnn_Tensor_t * p_tensor2_transpose = nullptr; const ggml_tensor * src0 = op->src[0]; const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; + ggml_tensor * dst = op; GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); - std::string map_entry; - get_graph_key_from_op(op, map_entry); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + std::string graph_name; + get_graph_key_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_tensor1 = tensors[1]; + p_tensor2 = tensors[2]; + p_param_tensor = tensors[3]; + p_tensor2_transpose = tensors[4]; } else { - tensor_0 = ggml_qnn_create_tensor(src0); - tensor_1 = ggml_qnn_create_tensor(src1); - tensor_2 = ggml_qnn_create_tensor(dst); + p_tensor0 = ggml_qnn_create_mulmat_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor1 = ggml_qnn_create_mulmat_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor2 = ggml_qnn_create_mulmat_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); } print_tensors_info(__func__, ctx, src0, src1, dst); - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; - 
QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + //ensure QNN tensor has correct tensor type + QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + //save the original dimensions of qnn tensors + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; if (!graph_initialized) { - graph_name = map_entry; GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + /* + there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor + + 2. QNN's MatMul can only support input tensors with rank >= 2 + + there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend. 
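           As a rough 2D sketch of that gap, following the same convention as
           get_qnn_dimensions_from_ggml_dimensions() earlier in this file and the
           perm parameter used by the transpose node composed below:

               // ggml: ne[0] = columns, ne[1] = rows  -> the 3x2 example above has ne = {2, 3}
               // QNN : dims[0] = rows,  dims[1] = columns -> {3, 2}
               uint32_t qnn_dims[2];
               qnn_dims[0] = (uint32_t) tensor->ne[1];   // rows
               qnn_dims[1] = (uint32_t) tensor->ne[0];   // columns

               // the graph composed below therefore runs
               //   MatMul(src0, src1, transpose_in1 = true) -> intermediate tensor
               //   Transpose(intermediate, perm = {1, 0})   -> dst in ggml's layout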
+ */ + + //step-1: create qnn graph error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); if (QNN_SUCCESS != error) { GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0)); - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1)); - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2)); - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + //step-2: create param tensor for mulmat of 2d matrix + uint32_t param_tensor_dims[] = {2}; + uint32_t param_tensor_data[2] = {1, 0}; + p_param_tensor = ggml_qnn_create_mulmat_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1, param_tensor_dims, param_tensor_data, 8); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); + + //step-3: create compute tensor from ggml tensor + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + //step-4: create a transpose tensor + uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; + p_tensor2_transpose = ggml_qnn_create_mulmat_tensor(dst,"transpose",QNN_TENSOR_TYPE_NATIVE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions,ggml_get_tensor_rank(dst)); + //save pointer because the dimensions of tensor p_tensor2_transpose will be changed later + uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions; + //update dimensions of tensor p_tensor2_transpose to make QNN SDK happy + QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_transpose_dims; + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); + + //step-5: compose qnn graph: add mat_mul node + Qnn_Param_t out_0_params[] = { + {QNN_PARAMTYPE_SCALAR, + QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, + .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} + } + }; - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; + Qnn_OpConfig_t out_0 = { + QNN_OPCONFIG_VERSION_1, .v1 = + {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + 1, + out_0_params, + 2, + out_0_inputs, + 1, + out_0_outputs} }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); + + //step-5: compose qnn graph: add transpose node + Qnn_Param_t out_trans1_0_params[] = { + {(Qnn_ParamType_t) 1, + "perm", .tensorParam = *p_param_tensor + } }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - 
"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, - qnn_params, - 2, - tensor_inputs, + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; + Qnn_OpConfig_t out_trans1_0 = { + QNN_OPCONFIG_VERSION_1, + .v1 = {"ggmlqnn_mulmat_transpose_opconfig", + "qti.aisw", + QNN_OP_TRANSPOSE, 1, + out_trans1_0_params, 1, - tensor_outputs - } + out_trans1_0_inputs, + 1, + out_trans1_0_outputs} }; - CHECK_QNN_API(error = qnn_raw_interface.graphAddNode(graph_handle, op_config)); - CHECK_QNN_API(error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - CHECK_QNN_API(error); - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); - instance->_qnn_graph_map[map_entry] = graph_item; + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); + + //step-6: finalize qnn graph and execute qnn graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + input_tensors_0, 2, + output_tensors_0, 1, + NULL, NULL)); + + qnn_tensors_t ggml_op_mulmat_tensors; + ggml_op_mulmat_tensors.reserve(5); + ggml_op_mulmat_tensors.push_back(p_tensor0); + ggml_op_mulmat_tensors.push_back(p_tensor1); + ggml_op_mulmat_tensors.push_back(p_tensor2); + ggml_op_mulmat_tensors.push_back(p_param_tensor); + ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); + + auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + + //avoid cleanup these resource to make test_backend_ops happy + //free_qnn_tensor(p_param_tensor); + //restore pointer to avoid memory leak + QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose; + //free_qnn_tensor(p_tensor2_transpose); } else { - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; - - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; Qnn_Tensor_t 
tensor_inputs[] = { - *tensor_0, - *tensor_1 + *p_tensor0, + *p_tensor1 }; Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 + *p_tensor2 }; - error = qnn_raw_interface.graphExecute(graph_handle, + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, - nullptr, nullptr); - CHECK_QNN_API(error); + nullptr, nullptr)); } - //avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; op_perf.info(); } @@ -3608,21 +3739,18 @@ static void ggml_backend_qnn_free(ggml_backend_t backend) { qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - std::map>::iterator graph_it; + std::map>>::iterator graph_it; for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) { auto & graph_item = graph_it->second; Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); - Qnn_Tensor_t * tensor_0 = std::get<1>(graph_item); - Qnn_Tensor_t * tensor_1 = std::get<2>(graph_item); - Qnn_Tensor_t * tensor_2 = std::get<3>(graph_item); + qnn_tensors_t & tensors = std::get<1>(graph_item); + for (auto tensor_it = tensors.begin(); tensor_it != tensors.end(); ++tensor_it) { + free_qnn_tensor(*tensor_it); + } GGML_UNUSED(graph_handle); GGMLQNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); - free_qnn_tensor(tensor_0); - free_qnn_tensor(tensor_1); - free_qnn_tensor(tensor_2); } instance->_qnn_graph_map.clear(); From 2c041d38c870eda5f0473413b4806d2dc5805e24 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 20 Feb 2025 08:39:15 +0800 Subject: [PATCH 069/200] ggml-qnn: remove redundant codes --- ggml/src/ggml-qnn/ggml-qnn.cpp | 298 +++++++++++---------------------- 1 file changed, 97 insertions(+), 201 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index a1aca7940bf4f..37c947f412f1f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1404,58 +1404,69 @@ static const char * ggml_get_type_name(ggml_type type) { return traits->type_name; } -static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - char tensor_name[GGML_MAX_NAME] = {0}; - - //TODO:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); - GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); - inc_idx(); - - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], - (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; +static const char * get_ggml_type_name(ggml_type type) { + const auto * traits = ggml_get_type_traits(type); + return traits->type_name; +} - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - } - Qnn_Tensor_t qnn_tensor = { - .version= QNN_TENSOR_VERSION_1, - {.v1= 
{ - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = ggml_get_tensor_rank(tensor), - .dimensions = dimensions, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, - .dataSize = 0}}}} - }; - Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (nullptr == p_qnn_tensor) { - GGMLQNN_LOG_WARN("calloc failed"); - return nullptr; +//TODO: +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; } - error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); - if (error != QNN_SUCCESS) { - free(p_qnn_tensor); - GGMLQNN_LOG_WARN("init tensor failed"); - return nullptr; + return QNN_DATATYPE_UNDEFINED; +} + +//TODO: +static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return GGML_TYPE_F32; + case QNN_DATATYPE_FLOAT_16: + return GGML_TYPE_F16; + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return GGML_TYPE_I32; + case QNN_DATATYPE_INT_16: + return GGML_TYPE_I16; + case QNN_DATATYPE_INT_8: + return GGML_TYPE_I8; + case QNN_DATATYPE_SFIXED_POINT_8: + return GGML_TYPE_Q8_0; + case QNN_DATATYPE_SFIXED_POINT_4: + return GGML_TYPE_Q4_0; + default: + break; } + return GGML_TYPE_COUNT; +} - return p_qnn_tensor; +//TODO: add more ops +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; } -static Qnn_Tensor_t * ggml_qnn_create_mulmat_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, +static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size) { Qnn_ErrorHandle_t error = QNN_SUCCESS; char tensor_name[GGML_MAX_NAME] = {0}; @@ -1480,6 +1491,7 @@ static Qnn_Tensor_t * ggml_qnn_create_mulmat_tensor(const ggml_tensor * tensor, dimensions_transpose[3] = (uint32_t) tensor->ne[3]; tensor_dims = dimensions_transpose; } + //re-assign tensor_dims if (nullptr != dims) { tensor_dims = dims; } @@ -1522,66 +1534,25 @@ static Qnn_Tensor_t * ggml_qnn_create_mulmat_tensor(const ggml_tensor * tensor, return p_qnn_tensor; } -//TODO: -// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return 
QNN_DATATYPE_UNDEFINED; -} +static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) { + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; -//TODO: -static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { - switch (qnn_type) { - case QNN_DATATYPE_FLOAT_32: - return GGML_TYPE_F32; - case QNN_DATATYPE_FLOAT_16: - return GGML_TYPE_F16; - case QNN_DATATYPE_UINT_32: - case QNN_DATATYPE_INT_32: - return GGML_TYPE_I32; - case QNN_DATATYPE_INT_16: - return GGML_TYPE_I16; - case QNN_DATATYPE_INT_8: - return GGML_TYPE_I8; - case QNN_DATATYPE_SFIXED_POINT_8: - return GGML_TYPE_Q8_0; - case QNN_DATATYPE_SFIXED_POINT_4: - return GGML_TYPE_Q4_0; - default: - break; + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - return GGML_TYPE_COUNT; -} -//TODO: add more ops -static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - return nullptr; -} + qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + Qnn_Tensor_t * p_qnn_tensor = ggml_qnn_create_general_tensor(tensor, nullptr, + qnn_tensor_type, qnn_data_type, + ggml_n_dims(tensor), dimensions, + nullptr, 0); -static const char * get_ggml_type_name(ggml_type type) { - const auto * traits = ggml_get_type_traits(type); - return traits->type_name; + return p_qnn_tensor; } static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { @@ -1865,7 +1836,7 @@ class qnn_instance { uint8_t do_node_validation = 1, const QnnGraph_Config_t ** graph_configs = nullptr ); - int init_qnn_graph(const std::string &graph_name, QNNBackend device, size_t vtcm_size_in_mb); + int init_qnn_graph(const std::string &graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); int finalize_qnn_graph(); @@ -2813,7 +2784,7 @@ int qnn_instance::qnn_finalize() { return ret_status; } -int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb) { +int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb, size_t hvx_threads) { _graph_name = graph_name; _device_id = device; @@ -2824,7 +2795,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi if (device == QNN_BACKEND_NPU) { QnnHtpGraph_CustomConfig_t hvx_config; hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; + hvx_config.numHvxThreads = hvx_threads; QnnGraph_Config_t graph_hvx_config; graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_hvx_config.customConfig = &hvx_config; @@ -2940,65 +2911,11 @@ static uint8_t * create_rpc_buffer(qnn_instance * instance, const ggml_tensor * return qnn_rpcbuffer; } -static Qnn_ErrorHandle_t create_htp_graph(ggml_backend_qnn_context * ctx, const std::string & graph_name, Qnn_GraphHandle_t * graph_handle) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr == ctx) - return QNN_MIN_ERROR_COMMON; - - qnn_instance * instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - 
QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 4; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 3; // 1 or 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - QnnHtpGraph_CustomConfig_t precision_config; - precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; - precision_config.precision = QNN_PRECISION_FLOAT16; - QnnGraph_Config_t graph_precision_config; - graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_precision_config.customConfig = &precision_config; - - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - &graph_precision_config, - NULL}; - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), - p_graphconfig, graph_handle); - return error; -} - static void print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { //skip sanity check of params - GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + if (nullptr != func_name && nullptr != ctx) { + GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + } GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], @@ -3019,29 +2936,14 @@ static void print_tensors_info(const char * func_name, ggml_backend_qnn_context static void dump_tensors_info(const struct ggml_tensor * tensor) { //skip sanity check of params - struct ggml_tensor * src0 = tensor->src[0]; + const struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; + struct ggml_tensor * dst = const_cast(tensor); GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - GGMLQNN_LOG_DEBUG( - "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - GGMLQNN_LOG_DEBUG( - "src1 %15s: 
type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - GGMLQNN_LOG_DEBUG( - " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, - tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], - tensor->ne[2], - tensor->nb[0], - tensor->nb[1], tensor->nb[2]); + print_tensors_info(nullptr, nullptr, src0, src1, dst); } //TODO: currently only support offloading 2D matrix to QNN backend @@ -3089,25 +2991,20 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { int64_t ne1 = tensor->ne[1]; if (tensor->op == GGML_OP_ADD) { + //dump_tensors_info(tensor); if (!ggml_are_same_shape(src0, src1)) { return false; } if (ne00 < 32) return false; - -#if GGMLQNN_PRINT_OP_ADD_LOG - //dump_tensors_info(tensor); -#endif + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - } if (tensor->op == GGML_OP_MUL_MAT) { -#if GGMLQNN_PRINT_OP_MUL_MAT_LOG - //dump_tensors_info(tensor); -#endif + dump_tensors_info(tensor); uint32_t src0_rank = ggml_get_tensor_rank(src0); uint32_t src1_rank = ggml_get_tensor_rank(src1); @@ -3181,17 +3078,12 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { if (!graph_initialized) { GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - error = create_htp_graph(ctx, graph_name, &graph_handle); - } else { - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), - nullptr, &graph_handle); - } + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); if (QNN_SUCCESS != error) { GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } + graph_handle = instance->get_qnn_graph_handle(); if (enable_npu_rpc) { QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; @@ -3391,9 +3283,9 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { p_param_tensor = tensors[3]; p_tensor2_transpose = tensors[4]; } else { - p_tensor0 = ggml_qnn_create_mulmat_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - p_tensor1 = ggml_qnn_create_mulmat_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - p_tensor2 = ggml_qnn_create_mulmat_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); } print_tensors_info(__func__, ctx, src0, src1, dst); @@ -3443,7 +3335,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-2: create param tensor for mulmat of 2d matrix uint32_t param_tensor_dims[] = {2}; uint32_t param_tensor_data[2] = {1, 0}; - p_param_tensor = ggml_qnn_create_mulmat_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1, param_tensor_dims, param_tensor_data, 8); + p_param_tensor 
= ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1, param_tensor_dims, param_tensor_data, 8); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); //step-3: create compute tensor from ggml tensor @@ -3457,8 +3349,8 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-4: create a transpose tensor uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; - p_tensor2_transpose = ggml_qnn_create_mulmat_tensor(dst,"transpose",QNN_TENSOR_TYPE_NATIVE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions,ggml_get_tensor_rank(dst)); + p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank(dst)); //save pointer because the dimensions of tensor p_tensor2_transpose will be changed later uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions; //update dimensions of tensor p_tensor2_transpose to make QNN SDK happy @@ -3547,6 +3439,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_Tensor_t tensor_outputs[] = { *p_tensor2 }; + //attention: + // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through + // QNN SDK, details could be found at + // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, From 49078108f90cb58669b6294ef1ccced7c1fe499d Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 20 Feb 2025 12:33:38 +0800 Subject: [PATCH 070/200] ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc --- ggml/src/ggml-qnn/ggml-qnn.cpp | 282 ++++++++++++++++++++++----------- 1 file changed, 186 insertions(+), 96 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 37c947f412f1f..ee273503b9e8a 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -13,9 +13,10 @@ * section-5 does ggml-qnn backend helper macro / data structure / function / class * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem * - * currently only provide OPs' QNN backend implementation of GGML_OP_ADD & GGML_OP_MUL_MAT: + * currently provide following ggml ops' QNN backend implementation: * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise - * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex op accordingly + * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise + * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly * * of course, can porting ggml-qnn to Windows on ARM as need. 
* @@ -95,7 +96,6 @@ #include "ggml-qnn.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" - // ================================================================================================= // section-1: forward/external declaration // ================================================================================================= @@ -110,9 +110,9 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const // ================================================================================================= #define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend #define GGML_QNN_LOGBUF_LEN 4096 -#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 1 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU #define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 #define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -1187,25 +1187,28 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, }; - struct qnn_op_caps_t { const char * qnn_op_name = nullptr; const size_t input_param_count = 0; const char * qnn_param_name = nullptr; }; -static const qnn_op_caps_t kOpCaps[] = { +static const qnn_op_caps_t k_op_caps[] = { {}, // GGML_OP_NONE {}, // GGML_OP_DUP { // GGML_OP_ADD - QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name - 2, // input_param_count + QNN_OP_ELEMENT_WISE_ADD, + 2, }, {}, // GGML_OP_ADD1 {}, // GGML_OP_ACC {}, // GGML_OP_SUB - {}, // GGML_OP_MUL + { + // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_MULTIPLY, + 2, + }, {}, // GGML_OP_DIV {}, // GGML_OP_SQR {}, // GGML_OP_SQRT @@ -1227,8 +1230,8 @@ static const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_GROUP_NORM { // GGML_OP_MUL_MAT - QNN_OP_MAT_MUL, // qnn_op_name - 2, // input_param_count + QNN_OP_MAT_MUL, + 2, }, {}, // GGML_OP_MUL_MAT_ID {}, // GGML_OP_OUT_PROD @@ -1580,11 +1583,9 @@ static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & o output.append(buffer, len); } -constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; - static size_t get_qnn_op_index(const ggml_tensor * tensor) { if (tensor->op == GGML_OP_UNARY) { - return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + return GGML_OP_COUNT + ggml_get_unary_op(tensor); } return tensor->op; @@ -1592,8 +1593,8 @@ static size_t get_qnn_op_index(const ggml_tensor * tensor) { static size_t get_qnn_op_input_param_count(const ggml_tensor * op) { auto op_index = get_qnn_op_index(op); - GGML_ASSERT(op_index < std::size(kOpCaps)); - return kOpCaps[op_index].input_param_count; + GGML_ASSERT(op_index < std::size(k_op_caps)); + return k_op_caps[op_index].input_param_count; } static void get_graph_key_from_op(const ggml_tensor * op, std::string & output) { @@ -1796,21 +1797,21 @@ class qnn_instance { int qnn_finalize(); - const qnn_interface &get_qnn_interface() { + const qnn_interface & get_qnn_interface() { if (!_qnn_interface.is_loaded()) { GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_interface; } - const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { if (!_qnn_interface.is_loaded()) { GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return 
_qnn_raw_interface; } - const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { if (!_qnn_interface.is_loaded()) { GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } @@ -1836,7 +1837,7 @@ class qnn_instance { uint8_t do_node_validation = 1, const QnnGraph_Config_t ** graph_configs = nullptr ); - int init_qnn_graph(const std::string &graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); + int init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); int finalize_qnn_graph(); @@ -1850,8 +1851,8 @@ class qnn_instance { return 1; } - QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; uint32_t power_configid = 1; uint32_t device_id = 0; uint32_t core_id = 0; @@ -1925,6 +1926,7 @@ class qnn_instance { } size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + size_t get_rpcmem_usage() { return _rpcmem_usage; } int32_t rpcmem_to_fd(void * buf); @@ -1950,6 +1952,32 @@ class qnn_instance { return _enable_qnn_rpc; } + void probe_device_meminfo() { + size_t candidate_size = 0; + uint8_t *rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], + strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + + free_rpcmem(); + _rpcmem_usage = 0; + GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + } + public: std::map>> _qnn_graph_map; @@ -1969,6 +1997,8 @@ class qnn_instance { void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { _qnn_raw_system_interface = raw_interface; } + + void * alloc_rpcmem_internal(size_t bytes, size_t alignment); private: static constexpr const int _required_num_providers = 1; @@ -1987,7 +2017,7 @@ class qnn_instance { qnn_interface _qnn_interface; - void *_system_lib_handle = nullptr; + void * _system_lib_handle = nullptr; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -2013,7 +2043,6 @@ class qnn_instance { std::unordered_map _qnn_mem_set; std::unordered_map _qnn_rpc_buffer_to_handles; - static std::mutex _init_mutex; static std::unordered_map _loaded_lib_handle; static std::unordered_map _lib_path_to_backend_id; @@ -2027,7 +2056,9 @@ class qnn_instance { pfn_rpc_mem_init _pfn_rpc_mem_init; pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; + std::unordered_map _rpcmem_usage_map; + size_t _rpcmem_capacity = 512; // mempool size in Mbytes + size_t _rpcmem_usage = 0; // mempool usage in MBytes std::string _graph_name; QNNBackend _device_id; @@ -2042,7 +2073,7 @@ std::unordered_map qnn_instance::_loaded_li std::unordered_map qnn_instance::_lib_path_to_backend_id; std::unordered_map qnn_instance::_loaded_backend; -void * 
qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { +void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { GGMLQNN_LOG_WARN("rpc memory not initialized\n"); return nullptr; @@ -2062,17 +2093,50 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); _pfn_rpc_mem_free(buf); } + return aligned_buf; +} + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (_rpcmem_usage > (_rpcmem_capacity - 8)) { // reserve 8Mbytes in rpc mempool + GGMLQNN_LOG_WARN("rpc mempool capcaity: %d MB, usage: %d MB", _rpcmem_capacity, _rpcmem_usage); + return nullptr; + } + + auto aligned_buf = alloc_rpcmem_internal(bytes, alignment); + if (nullptr == aligned_buf) + return nullptr; + _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes)); + size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); + rpcmem_usage_in_bytes += bytes; + _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); return aligned_buf; } void qnn_instance::free_rpcmem(void * buf) { + size_t rpcbuffer_size = 0; if (!_rpcmem_initialized) { GGMLQNN_LOG_WARN("rpc memory not initialized\n"); } else if (0 == _rpcmem_store_map.count(buf)) { GGMLQNN_LOG_WARN("no allocated tensor\n"); } else { GGMLQNN_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]); + for (std::unordered_map::iterator it = _rpcmem_usage_map.begin(); + it != _rpcmem_usage_map.end(); + it++) { + void * rpcbuffer = it->first; + if (buf == rpcbuffer) { + rpcbuffer_size = it->second; + size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); + rpcmem_usage_in_bytes -= rpcbuffer_size; + _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); + } + } + if (rpcbuffer_size != 0) { + _rpcmem_usage_map.erase(buf); + } else { + GGMLQNN_LOG_WARN("it shouldn't happen, pls check why?"); + } _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } @@ -2094,6 +2158,8 @@ void qnn_instance::free_rpcmem() { _pfn_rpc_mem_free(rpcbuffer); } _rpcmem_store_map.clear(); + _rpcmem_usage_map.clear(); + _rpcmem_usage = 0; } int32_t qnn_instance::rpcmem_to_fd(void * buf) { @@ -2177,7 +2243,11 @@ Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t ran } GGMLQNN_LOG_DEBUG("mem_fd %d", mem_fd); - Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; + Qnn_MemDescriptor_t descriptor = { + {rank, dimensions, nullptr}, + data_type, QNN_MEM_TYPE_ION, + {{mem_fd}} + }; Qnn_MemHandle_t handle = nullptr; auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); @@ -2318,7 +2388,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; -#if 0 // keep them here for further use +#if 0 // leave them here for further use QnnSaver_Config_t outputdir_cfg; outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; outputdir_cfg.outputDirectory = "/data/local/tmp/"; @@ -2468,6 +2538,7 @@ int qnn_instance::unload_system() { return result; } +#if GGMLQNN_PRINT_QNN_INTERNAL_LOG static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, @@ -2499,24 +2570,25 @@ static void ggml_qnn_logcallback(const char * fmt, } double ms = (double) timestamp / 1000000.0; - { std::lock_guard lock(log_mutex); - memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, 
fmt, argp); -#if GGMLQNN_PRINT_QNN_INTERNAL_LOG GGMLQNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); -#endif } } +#else +static void ggml_qnn_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { +} +#endif int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; GGMLQNN_LOG_DEBUG("enter qni_init\n"); - const std::lock_guard lock(_init_mutex); - if (0 != load_system()) { GGMLQNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); return 1; @@ -2542,9 +2614,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _loaded_lib_handle.count(backend_id)); return 3; } - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - #if 1 _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); #else @@ -2671,25 +2741,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); - if (nullptr == rpc_buffer) { - GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + probe_device_meminfo(); if (0 != init_htp_perfinfra()) { GGMLQNN_LOG_WARN("initialize HTP performance failure"); @@ -2963,6 +3015,7 @@ static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, u // ================================================================================================= // section-6: implementation of ggml-qnn backend // ================================================================================================= +//TODO: refine this function as it is a performance hotspot/bottleneck function static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { if (tensor->op == GGML_OP_NONE) { return true; @@ -2973,7 +3026,9 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { return false; } - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); + //TODO: support other op + bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT) + || (tensor->op == GGML_OP_MUL)); if (!supported_op) { return false; } @@ -2981,37 +3036,34 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; - int64_t ne00 = tensor->src[0]->ne[0]; - int64_t ne01 = tensor->src[0]->ne[1]; + const int64_t ne00 = tensor->src[0]->ne[0]; + const int64_t ne01 = tensor->src[0]->ne[1]; - int64_t ne10 = tensor->src[1]->ne[0]; - int64_t ne11 = tensor->src[1]->ne[1]; + const int64_t ne10 = tensor->src[1]->ne[0]; + const int64_t ne11 = tensor->src[1]->ne[1]; - int64_t ne0 = tensor->ne[0]; - int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; + + const uint32_t src0_rank = 
ggml_get_tensor_rank(src0); + const uint32_t src1_rank = ggml_get_tensor_rank(src1); if (tensor->op == GGML_OP_ADD) { //dump_tensors_info(tensor); if (!ggml_are_same_shape(src0, src1)) { return false; } - if (ne00 < 32) return false; - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); } if (tensor->op == GGML_OP_MUL_MAT) { - dump_tensors_info(tensor); - uint32_t src0_rank = ggml_get_tensor_rank(src0); - uint32_t src1_rank = ggml_get_tensor_rank(src1); - + //dump_tensors_info(tensor); if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mulmat to QNN backend return false; - - //TODO: support more data type in func ggml_qnn_mul_mat(...): + //TODO: support more data type in func ggml_qnn_mul_mat(...) //src0: q4_0, q6_k, ... //src1: f32 //dst : f32 @@ -3020,19 +3072,30 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { && (src0->type == src1->type) && (src0->type == tensor->type); } - //TODO:for other op + if (tensor->op == GGML_OP_MUL) { + dump_tensors_info(tensor); + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mul to QNN backend + return false; + return (src0->type == GGML_TYPE_F32) + && (src1->type == GGML_TYPE_F32) + && (tensor->type == src1->type); + } + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && (src0->type == src1->type) && (src0->type == tensor->type); } -static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { +/* + * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input + * tensor and 1 output tensor +*/ +static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; enum ggml_status result = GGML_STATUS_SUCCESS; bool graph_initialized = false; qnn_instance * instance = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - qnn_perf op_perf = qnn_perf("ggml_qnn_add"); Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * p_tensor0 = nullptr; Qnn_Tensor_t * p_tensor1 = nullptr; @@ -3045,6 +3108,14 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + size_t qnn_op_index = get_qnn_op_index(op); + GGML_ASSERT(qnn_op_index < std::size(k_op_caps)); + const char * qnn_op_name = k_op_caps[qnn_op_index].qnn_op_name; + std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); + const char * ggml_op_name = ggml_op_name_string.c_str(); + + qnn_perf op_perf = qnn_perf(ggml_op_name); op_perf.start(); std::string graph_name; @@ -3124,9 +3195,9 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { }; Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, .v1 = { - "ggml_op_add", + ggml_op_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, + qnn_op_name, 0, qnn_params, 2, @@ -3138,9 +3209,9 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); + tensor_inputs, 2, + tensor_outputs, 1, + 
nullptr, nullptr)); if (enable_npu_rpc) { uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); @@ -3214,9 +3285,9 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { *p_tensor2 }; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); if (enable_npu_rpc) { //TODO:NPU RPC feature will failed with test-backend-ops @@ -3231,18 +3302,17 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; -#if GGMLQNN_PRINT_OP_ADD_LOG op_perf.info(); -#endif } /* - * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add but much more complicated than ggml_qnn_add, + * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated + * than ggml_qnn_general_node. * matrix transpose and type trait are required for offload mulmat to QNN backend, * so it's a standalone function. accordingly, this is another typical skeleton for offload other * ggml ops to QNN backend * - * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, we should focus on MUL_MAT. + * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, should focus on MUL_MAT. * * have three kinds of MUL_MAT to compute: * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend @@ -3288,7 +3358,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); } - print_tensors_info(__func__, ctx, src0, src1, dst); + //print_tensors_info(__func__, ctx, src0, src1, dst); //ensure QNN tensor has correct tensor type QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -3444,8 +3514,8 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { // QNN SDK, details could be found at // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, + tensor_inputs, 2, + tensor_outputs, 1, nullptr, nullptr)); } @@ -3453,7 +3523,6 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; - op_perf.info(); } @@ -3462,13 +3531,17 @@ static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor switch (tensor->op) { case GGML_OP_ADD: - func = ggml_qnn_add; + func = ggml_qnn_general_node; break; case GGML_OP_MUL_MAT: func = ggml_qnn_mul_mat; break; + case GGML_OP_MUL: + func = ggml_qnn_general_node; + break; + default: return false; } @@ -3667,7 +3740,6 @@ static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, s ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; GGML_UNUSED(ctx); - GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE @@ -3715,10 +3787,28 
@@ static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t d } static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - //FIXME:this is NOT QNN device memory info - *free = get_system_free_memory_in_bytes(); - *total = get_system_total_memory_in_bytes(); - GGML_UNUSED(dev); + struct ggml_backend_qnn_context * ctx = static_cast(dev->context); + if ((nullptr == ctx) || (ctx->device > QNN_BACKEND_GGML)) { + GGMLQNN_LOG_ERROR("pls check params"); + *free = 0; + *total = 0; + } + + if (QNN_BACKEND_CPU == ctx->device || QNN_BACKEND_GGML == ctx->device) { + *total = get_system_total_memory_in_bytes(); + *free = get_system_free_memory_in_bytes(); + } else if (QNN_BACKEND_GPU == ctx->device) { + //TODO: probe GPU info in Qualcomm Adreno GPU + *total = get_system_total_memory_in_bytes(); + *free = get_system_free_memory_in_bytes(); + } else if (QNN_BACKEND_NPU == ctx->device) { + size_t rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); + size_t rpc_ion_usage = ctx->instance->get_rpcmem_usage(); + GGMLQNN_LOG_DEBUG("rpc memsize %d", rpc_ion_memsize); + GGMLQNN_LOG_DEBUG("rpc usage %d", rpc_ion_usage); + *total = rpc_ion_memsize * (1 << 20); + *free = (rpc_ion_memsize - rpc_ion_usage) * (1 << 20); + } } static enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { From 55cd181dfbee1fd9ca5c9c0edbaa8fedf1ae938f Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 20 Feb 2025 22:20:15 +0800 Subject: [PATCH 071/200] ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc --- ggml/src/ggml-qnn/ggml-qnn.cpp | 106 +++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index ee273503b9e8a..9ef502421c051 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1483,15 +1483,13 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); inc_idx(); - //there are different dimension order between ggml tensor and qnn tensor uint32_t dimensions_transpose[GGML_MAX_DIMS] = {}; uint32_t * tensor_dims = nullptr; - if (nullptr != tensor) { - dimensions_transpose[0] = (uint32_t) tensor->ne[1]; - dimensions_transpose[1] = (uint32_t) tensor->ne[0]; - dimensions_transpose[2] = (uint32_t) tensor->ne[2]; - dimensions_transpose[3] = (uint32_t) tensor->ne[3]; + //there are different dimension order between ggml tensor and qnn tensor + for (size_t idx = 0; idx < rank; idx++) { + dimensions_transpose[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + } tensor_dims = dimensions_transpose; } //re-assign tensor_dims @@ -2058,7 +2056,7 @@ class qnn_instance { std::unordered_map _rpcmem_store_map; std::unordered_map _rpcmem_usage_map; size_t _rpcmem_capacity = 512; // mempool size in Mbytes - size_t _rpcmem_usage = 0; // mempool usage in MBytes + size_t _rpcmem_usage = 0; // mempool usage in Mbytes std::string _graph_name; QNNBackend _device_id; @@ -2968,33 +2966,27 @@ static void print_tensors_info(const char * func_name, ggml_backend_qnn_context if (nullptr != func_name && nullptr != ctx) { GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); } - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", src0->name, - src0->type, 
ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - GGMLQNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - GGMLQNN_LOG_DEBUG("tensor0 name %s", src0->name); - GGMLQNN_LOG_DEBUG("tensor1 name %s", src1->name); - GGMLQNN_LOG_DEBUG("tensor2 name %s", dst->name); + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]); + GGMLQNN_LOG_DEBUG("\n"); } -static void dump_tensors_info(const struct ggml_tensor * tensor) { +static void dump_op_info(const struct ggml_tensor * tensor) { //skip sanity check of params const struct ggml_tensor * src0 = tensor->src[0]; - struct ggml_tensor * src1 = tensor->src[1]; - struct ggml_tensor * dst = const_cast(tensor); - GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); + struct ggml_tensor * src1 = tensor->src[1]; + struct ggml_tensor * dst = const_cast(tensor); + GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); print_tensors_info(nullptr, nullptr, src0, src1, dst); } @@ -3008,8 +3000,13 @@ static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, u GGMLQNN_LOG_WARN("invalid params"); return; } - qnn_dimensions[0] = ggml_dimensions[1]; - qnn_dimensions[1] = ggml_dimensions[0]; + for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) + qnn_dimensions[idx] = ggml_dimensions[idx]; + + if (rank >= 2) { + qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; + qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; + } } // ================================================================================================= @@ -3060,9 +3057,16 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { } if (tensor->op == GGML_OP_MUL_MAT) { - //dump_tensors_info(tensor); - if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mulmat to QNN backend + dump_op_info(tensor); + if (src0_rank != src1_rank) // make QNN SDK happy + return false; + if (src0_rank < 2) // make QNN SDK happy + return false; + if (src0_rank > 3) //TODO: 4D matrix return false; + if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK 
happy + return false; + //TODO: support more data type in func ggml_qnn_mul_mat(...) //src0: q4_0, q6_k, ... //src1: f32 @@ -3073,8 +3077,8 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { } if (tensor->op == GGML_OP_MUL) { - dump_tensors_info(tensor); - if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mul to QNN backend + //dump_tensors_info(tensor); + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix return false; return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) @@ -3340,6 +3344,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; op_perf.start(); + uint32_t src0_rank = ggml_get_tensor_rank(src0); + uint32_t src1_rank = ggml_get_tensor_rank(src1); + GGML_ASSERT(src0_rank == src1_rank); + GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation + std::string graph_name; get_graph_key_from_op(op, graph_name); if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { @@ -3353,12 +3362,12 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { p_param_tensor = tensors[3]; p_tensor2_transpose = tensors[4]; } else { - p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); } - //print_tensors_info(__func__, ctx, src0, src1, dst); + print_tensors_info(__func__, ctx, src0, src1, dst); //ensure QNN tensor has correct tensor type QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -3403,9 +3412,16 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { return; } //step-2: create param tensor for mulmat of 2d matrix - uint32_t param_tensor_dims[] = {2}; - uint32_t param_tensor_data[2] = {1, 0}; - p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1, param_tensor_dims, param_tensor_data, 8); + const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, + }; + uint32_t param_tensor_dims[1] = {src0_rank}; + p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, + 1, param_tensor_dims, + (void *) (param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); //step-3: create compute tensor from ggml tensor @@ -3419,7 +3435,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-4: create a transpose tensor uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; - p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + 
p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank(dst)); //save pointer because the dimensions of tensor p_tensor2_transpose will be changed later uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions; @@ -3435,7 +3451,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { } }; - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; Qnn_OpConfig_t out_0 = { QNN_OPCONFIG_VERSION_1, .v1 = @@ -3455,7 +3471,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { "perm", .tensorParam = *p_param_tensor } }; - Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; Qnn_OpConfig_t out_trans1_0 = { QNN_OPCONFIG_VERSION_1, @@ -3472,7 +3488,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-6: finalize qnn graph and execute qnn graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1}; Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors_0, 2, @@ -3495,9 +3511,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //restore pointer to avoid memory leak QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose; //free_qnn_tensor(p_tensor2_transpose); - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; From 14cad8d050f12d928a189b267aafafb75accbaed Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 21 Feb 2025 17:43:25 +0800 Subject: [PATCH 072/200] ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc --- ggml/src/ggml-qnn/ggml-qnn.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 9ef502421c051..e862b07a234eb 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1132,8 +1132,8 @@ struct ggml_backend_qnn_context { struct qcom_socinfo socinfo; } ; -//TODO: the following global vars and three helper funcs should be removed in the future -static int32_t g_ggmltensor_idx = 0; +//the following helper funcs are used to ensure every QNN tensor name is unique +static std::atomic g_ggmltensor_idx(0); static void reset_idx() { g_ggmltensor_idx = 0; } @@ -1143,7 +1143,7 @@ static void inc_idx() { } static int32_t get_idx() { - return g_ggmltensor_idx; + return g_ggmltensor_idx.load(); } // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html @@ -1474,7 +1474,7 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, Qnn_ErrorHandle_t error = QNN_SUCCESS; char tensor_name[GGML_MAX_NAME] = {0}; - //TODO:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique + //ensure the tensor name is unique if 
(nullptr != name) { snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); } else { @@ -2762,7 +2762,6 @@ int qnn_instance::qnn_finalize() { Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("enter %s\n", __func__); - //TODO:should be removed in the future reset_idx(); free_rpcmem(); @@ -3451,7 +3450,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { } }; - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; Qnn_OpConfig_t out_0 = { QNN_OPCONFIG_VERSION_1, .v1 = @@ -3488,7 +3487,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-6: finalize qnn graph and execute qnn graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors_0, 2, From a054b4795486234230a1fb1d445e056909217a38 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 14 Feb 2025 21:50:53 +0800 Subject: [PATCH 073/200] ggml-qnn: add Qualcomm QNN backend for GGML --- ggml/src/ggml-qnn/CMakeLists.txt | 8 +- ggml/src/ggml-qnn/ggml-qnn-impl.h | 611 +++++++ ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 842 ++++++++++ ggml/src/ggml-qnn/ggml-qnn-ops.h | 52 + ggml/src/ggml-qnn/ggml-qnn.cpp | 2469 ++++++++-------------------- scripts/build-run-android.sh | 98 +- 6 files changed, 2241 insertions(+), 1839 deletions(-) create mode 100644 ggml/src/ggml-qnn/ggml-qnn-impl.h create mode 100644 ggml/src/ggml-qnn/ggml-qnn-ops.cpp create mode 100644 ggml/src/ggml-qnn/ggml-qnn-ops.h diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 7bbb9be76b4f6..1156c98fbc9d7 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -4,12 +4,14 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) set(QNN_LINK_LIBRARIES ${LOG_LIB}) set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") else() - message(FATAL_ERROR "QNN now only available on Android") + message(FATAL_ERROR "QNN now only available on Android and Windows(Windows on ARM)") endif() if(NOT DEFINED GGML_QNN_SDK_PATH) - # try read from environment variable +# try read from environment variable if(DEFINED ENV{QNN_SDK_PATH}) set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) else() @@ -22,7 +24,7 @@ message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") -ggml_add_backend_library(ggml-qnn + ggml_add_backend_library(ggml-qnn ${QNN_SOURCES} ) diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h new file mode 100644 index 0000000000000..5a2fe5752a097 --- /dev/null +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -0,0 +1,611 @@ +/* +* Copyright (c) 2023-2024 The ggml authors +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to +* deal in the Software without restriction, including without limitation the +* rights to use, copy, modify, merge, publish, 
distribute, sublicense, and/or +* sell copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +*/ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__ANDROID__) || defined(__linux__) +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if (defined __ANDROID__) || (defined ANDROID) +#include "android/log.h" +#endif + +#if defined(_WIN32) +#include +#include +#endif + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" +#include "HTP/QnnHtpGraph.h" + +#include "ggml-qnn.h" +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + +class qnn_instance; +struct ggml_backend_qnn_context; +void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); + +#if 0//def NDEBUG +#define GGMLQNN_DEBUG 0 +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 0 +#else +#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 +#endif +#define GGML_QNN_LOGBUF_LEN 4096 + +#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGMLQNN_DEBUG +#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLQNN_LOG_DEBUG(...) 
+#endif + +#define CHECK_QNN_API(error, result) \ + do { \ + error = (result); \ + if (QNN_SUCCESS != error) { \ + if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ + GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + } else { \ + GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_error_string(error)); \ + } \ + } \ + } while (0) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete + +#define GQCGT ggmlqnn_create_general_tensor + +#if defined(_WIN32) +#define RTLD_GLOBAL 0x100 +#define RTLD_LOCAL 0x000 +#define RTLD_LAZY 0x000 +#define RTLD_NOW 0x001 +void * dlopen(const char * filename, int flag); +int dlclose(void * handle); +void * dlsym(void* handle, const char* name); +const char * dlerror(void); +#endif + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + +using qnn_res_t = std::tuple>; +using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, +}; + +enum qcom_chipset_soc_model { + UNKNOWN_SM = 0, + SM7450 = 41, // v69, 7 Gen1 + SM8350 = 30, // v68, 888 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Gen 4 +#if defined(_MSC_VER) + SC7280X = 44, + SC8280X = 37, + SC8380XP = 60, +#endif +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + char soc_desc[GGML_MAX_NAME]; +}; + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char desc[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; + + std::unique_ptr work_data; + std::vector> tasks; + size_t work_size = 0; + size_t desired_size = 0; + int n_threads = GGML_DEFAULT_N_THREADS; +}; + +struct qnn_op_caps_t { + const char * qnn_op_name = nullptr; + const size_t input_param_count = 0; + const char * qnn_param_name = nullptr; +}; +extern const qnn_op_caps_t ggmlqnn_k_op_caps[]; + +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() { + _begin_time = ggml_time_us(); + } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + } + +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration 
= 0LL; + std::string _perf_name; +}; +#else +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) { + GGML_UNUSED(perf_name); + } + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() {} + void info() {} +}; +#endif + +class qnn_interface { +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree) + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage) + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig) + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion) + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree) + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure) + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo) + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo) + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree) + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve) + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree) + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel) + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree) + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister) + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister) + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability) + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor) + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor) + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate) + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, 
systemContextGetBinaryInfo) + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree) + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t * _qnn_interface = nullptr; + + const QnnSystemInterface_t * _qnn_sys_interface = nullptr; +}; + +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {} + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + int init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); + + int finalize_qnn_graph(); + + bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } + + int init_htp_perfinfra(); + + int set_rpc_polling(); + + int set_high_performance_mode(); + + std::string & get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + size_t get_rpcmem_usage() { return _rpcmem_usage; } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); + + void unregister_rpcmem(); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle); + + void * alloc_rpcmem(size_t bytes, size_t alignment); + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); + + void free_rpcmem(void * buf); + void 
free_rpcmem(); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + + bool enable_qnn_rpc() { + return _enable_qnn_rpc; + } + +public: + std::map>> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + + void * alloc_rpcmem_internal(size_t bytes, size_t alignment); + + void probe_device_meminfo(); + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // name of prebuilt QNN model, might be used in the future + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + void * _system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + qnn_interface _qnn_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_rpc_buffer_to_handles; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + std::unordered_map _rpcmem_usage_map; + size_t _rpcmem_usage = 0; // mempool usage in Mbytes + size_t _rpcmem_capacity = 512; // mempool size in Mbytes + + std::string _graph_name; + QNNBackend _device_id; + void * _rpc_lib_handle = nullptr; + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature + + DISABLE_COPY(qnn_instance); + DISABLE_MOVE(qnn_instance); +}; + +size_t ggmlqnn_get_opcaps_size(void); +size_t ggmlqnn_get_op_index(const ggml_tensor * tensor); +Qnn_Tensor_t * ggmlqnn_create_compute_tensor(const ggml_tensor * tensor); +const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code); +Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype); +void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); +void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output); +uint8_t * 
ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata); +void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t num_outputs); +Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false); diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp new file mode 100644 index 0000000000000..00cb7da32c183 --- /dev/null +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -0,0 +1,842 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#include "ggml-impl.h" +#include "ggml-common.h" +#include "ggml-qnn-ops.h" + +static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + +static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + return true; +} + +#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +/* + * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input + * tensor and 1 output tensor +*/ +void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + enum ggml_status result = GGML_STATUS_SUCCESS; + bool graph_initialized = false; + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + size_t qnn_op_index = ggmlqnn_get_op_index(op); + GGML_ASSERT(qnn_op_index < ggmlqnn_get_opcaps_size()); + const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; + std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); + const char * ggml_op_name = ggml_op_name_string.c_str(); + + qnn_perf op_perf = qnn_perf(ggml_op_name); + op_perf.start(); + + std::string graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensor = std::get<1>(graph_item); + p_tensor0 = tensor[0]; + p_tensor1 = tensor[1]; + p_tensor2 = tensor[2]; + } else { + p_tensor0 = ggmlqnn_create_compute_tensor(src0); + p_tensor1 = ggmlqnn_create_compute_tensor(src1); + p_tensor2 = ggmlqnn_create_compute_tensor(dst); + } + //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + + //ensure QNN tensor has correct tensor type + QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; + + //save the original dimensions of qnn tensors + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; + + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; + + if (!graph_initialized) { + GGMLQNN_LOG_DEBUG("graph name %s", 
graph_name.c_str()); + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + graph_handle = instance->get_qnn_graph_handle(); + + if (enable_npu_rpc) { + QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor0)->clientBuf = {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*p_tensor1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor1)->clientBuf = {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*p_tensor2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor2)->clientBuf = {.data=nullptr, .dataSize=0}; + } + + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + + if (enable_npu_rpc) { + uint8_t * qnn_rpcbuffer_0 = ggmlqnn_create_rpc_buffer(instance, src0, p_tensor0, true); + uint8_t * qnn_rpcbuffer_1 = ggmlqnn_create_rpc_buffer(instance, src1, p_tensor1, true); + uint8_t * qnn_rpcbuffer_2 = ggmlqnn_create_rpc_buffer(instance, dst, p_tensor2, false); + if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { + GGMLQNN_LOG_INFO("create rpc buffer failure\n"); + //TODO: potential memory leak although it shouldn't happen + return; + } + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + } + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + Qnn_OpConfig_t op_config = { + QNN_OPCONFIG_VERSION_1, .v1 = { + ggml_op_name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + + if (enable_npu_rpc) { + uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); + if (nullptr != qnn_rpcbuffer) { + memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); + } + } + + qnn_tensors_t ggml_op_add_tensors; + ggml_op_add_tensors.reserve(3); + ggml_op_add_tensors.push_back(p_tensor0); + ggml_op_add_tensors.push_back(p_tensor1); + ggml_op_add_tensors.push_back(p_tensor2); + + auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + } else { + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + src0_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = ggmlqnn_datatype_from_ggml_datatype(dst->type); + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) 
src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*p_tensor0)->rank = ggml_n_dims(src0); + QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; + + QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*p_tensor1)->rank = ggml_n_dims(src1); + QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; + + QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; + QNN_VER_PTR(*p_tensor2)->rank = ggml_n_dims(dst); + QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type; + + if (enable_npu_rpc) { + //TODO: NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } + + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + } + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + + if (enable_npu_rpc) { + //TODO:NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + } + } + + // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; + +#if GGMLQNN_PRINT_OP_ADD_LOG + op_perf.info(); +#endif +} + +/* + * this function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend + * UT in ggml-qnn-ut.cpp passed: + * ./scripts/build-run-android.sh run_ut_mulmat 0 + * ./scripts/build-run-android.sh run_ut_mulmat 1 + * ./scripts/build-run-android.sh run_ut_mulmat 2 + * + * the logic of ggml_qnn_mul_mat_4d is similar to ggml_qnn_mul_mat but much more complicated + * than ggml_qnn_mul_mat, so it's a standalone function. 
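+ * roughly, the shape mapping performed below is (using the variables defined in
+ * the function body: K = src0->ne[0], M = src0->ne[1], N = src1->ne[1],
+ * B0 = src0->ne[2] * src0->ne[3], B1 = src1->ne[2] * src1->ne[3]):
+ *   src0: reshape to [B0, M, K], then tile to [B1, M, K]
+ *   src1: transpose the last two axes, then reshape to [B1, K, N]
+ *   MatMul([B1, M, K], [B1, K, N]) -> [B1, M, N], reshaped back to dst's 4-D layout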
+ * it will be combined with ggml_qnn_mul_mat in the future + */ +static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); + qnn_instance *instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const ggml_tensor *src0 = op->src[0]; + const ggml_tensor *src1 = op->src[1]; + ggml_tensor *dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); + op_perf.start(); + + std::string graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); + + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t *p_tensor0 = nullptr; + Qnn_Tensor_t *p_reshape0_out = nullptr; + Qnn_Tensor_t *p_tile0_out = nullptr; + Qnn_Tensor_t *p_tensor1 = nullptr; + Qnn_Tensor_t *p_permute1_out = nullptr; + Qnn_Tensor_t *p_reshape1_out = nullptr; + Qnn_Tensor_t *p_matmul_out = nullptr; + Qnn_Tensor_t *p_reshape2_out = nullptr; + + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t &graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t &tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_reshape0_out = tensors[1]; + p_tile0_out = tensors[2]; + p_tensor1 = tensors[3]; + p_permute1_out = tensors[4]; + p_reshape1_out = tensors[5]; + p_matmul_out = tensors[6]; + p_reshape2_out = tensors[7]; + } else { + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), NULL, &graph_handle)); + + // Define dimensions + uint32_t K = src0->ne[0]; // Inner dimension + uint32_t M = src0->ne[1]; // Rows of src0 + uint32_t N = src1->ne[1]; // Columns of src1 + uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch + uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) + + // Validate K only + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match + + // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; + p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src0_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + + // Reshape src0 to [B0, M, K] + uint32_t reshape0_out_dims[] = {B0, M, K}; + p_reshape0_out = GQCGT(nullptr, "reshape0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape0_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape0_out)); + Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; + Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; + Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape0_inputs, 1, reshape0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); + + // Tile src0 to match B1: [B0, M, K] -> [B1, M, K] + uint32_t tile0_out_dims[] = {B1, M, K}; + p_tile0_out = GQCGT(nullptr, "tile0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + tile0_out_dims, nullptr, 0); + CHECK_QNN_API(error, 
qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile0_out)); + uint32_t tile_multiples[] = {B1 / B0, 1, 1}; + uint32_t tile_dims[] = {3}; + Qnn_Tensor_t *p_tile_multiples = GQCGT(nullptr, "tile_multiples", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + tile_dims, tile_multiples, sizeof(tile_multiples)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile_multiples)); + Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; + Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; + Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; + Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TILE, tile_params, 1, + tile0_inputs, 1, tile0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); + + // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; + p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src1_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + + // Permute src1 to [B1, H1, K, N] + uint32_t perm_data[] = {0, 1, 3, 2}; + uint32_t perm_dims[] = {4}; + Qnn_Tensor_t *p_perm = GQCGT(nullptr, "perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_perm)); + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; + p_permute1_out = GQCGT(nullptr, "permute1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + permute1_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_permute1_out)); + Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; + Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; + Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; + Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, permute1_params, 1, + permute1_inputs, 1, permute1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); + + // Reshape src1 to [B1, K, N] + uint32_t reshape1_out_dims[] = {B1, K, N}; + p_reshape1_out = GQCGT(nullptr, "reshape1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape1_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape1_out)); + Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; + Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; + Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape1_inputs, 1, reshape1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); + + // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] + uint32_t matmul_out_dims[] = {B1, M, N}; + p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + matmul_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); + Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; + Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; + Qnn_OpConfig_t 
matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, nullptr, 0, + matmul_inputs, 2, matmul_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); + + // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] + uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; + p_reshape2_out = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, + reshape2_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape2_out)); + Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; + Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; + Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape2_inputs, 1, reshape2_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); + + // Finalize + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + + // Cache + qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out}; + instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + } + + // Execute + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; + + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, + output_tensors, 1, NULL, NULL)); + +#if 0 + // Log dst for debugging + float *dst_data = (float *)dst->data; + GGMLQNN_LOG_DEBUG("dst shape: [%d, %d, %d, %d]\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) { + GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]); + } +#endif + + op_perf.info(); +} + +/* + * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs + * using the QNN backend. this function performs matrix multiplication of the input tensor + * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, + * and stores the result in the destination tensor `dst`. + * + * @param backend the context which got through (ggml_backend_qnn_context *)backend->context for the + * QNN backend operations. + * @param op the destination tensor where the result of the matrix multiplication will be stored. + * + * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated + * than ggml_qnn_general_node. so it's a standalone function. accordingly, this is another + * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute + * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds + * of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend + * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) 
+ * and src1 is F32, src0 -> f32 in src0', then src0' * src1 +*/ +void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Tensor_t * p_param_tensor = nullptr; + Qnn_Tensor_t * p_tensor2_transpose = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + op_perf.start(); + + const enum ggml_type src0_type = src0->type; + const uint32_t src0_rank = ggml_n_dims(src0); + const uint32_t src1_rank = ggml_n_dims(src1); + GGML_ASSERT(src0_rank == src1_rank); + GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy + if (4 == src0_rank) { + return ggml_qnn_mul_mat_4d(ctx, op); + } + void * wdata = ggmlqnn_type_trait(ctx, op); + const size_t desired_size = ctx->desired_size; + + std::string graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_tensor1 = tensors[1]; + p_tensor2 = tensors[2]; + p_param_tensor = tensors[3]; + p_tensor2_transpose = tensors[4]; + } else { + p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + } + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + + //ensure QNN tensor has correct tensor type + QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; + + //save the original dimensions of qnn tensors + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; + + if (!graph_initialized) { + GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + /* + there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor + + 2. QNN's MatMul can only support input tensors with rank >= 2 + + in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose + operation when offloading mulmat to QNN backend. 
this concise implementation will handle + transpose in func ggml_qnn_create_general_tensor() + */ + //step-1: create qnn graph + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + //step-2: create param tensor for mulmat of 2d/3d/4d matrix + const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, + }; + uint32_t param_tensor_dims[1] = {src0_rank}; + p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); + + //step-3: create compute tensor from ggml tensor + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + //step-4: create a transpose tensor + p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); + + //step-5: compose qnn graph: add mat_mul node + Qnn_Param_t out_0_params[] = { + {QNN_PARAMTYPE_SCALAR, + QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, + .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} + } + }; + + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; +#if 0 //leave here for easily understand code, can be removed in the future + Qnn_OpConfig_t out_0 = { + QNN_OPCONFIG_VERSION_1, .v1 = + {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + 1, + out_0_params, + 2, + out_0_inputs, + 1, + out_0_outputs} + }; +#else + Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); +#endif + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); + + //step-5: compose qnn graph: add transpose node + Qnn_Param_t out_trans1_0_params[] = { + {QNN_PARAMTYPE_TENSOR, + "perm", .tensorParam = *p_param_tensor + } + }; + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; +#if 0 //leave here for easily understand code, can be removed in the future + Qnn_OpConfig_t out_trans1_0 = { + QNN_OPCONFIG_VERSION_1, + .v1 = {"ggmlqnn_mulmat_transpose_opconfig", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, 1, + out_trans1_0_params, + 1, + out_trans1_0_inputs, + 1, + out_trans1_0_outputs} + }; +#else + Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, + 
out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); +#endif + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); + + //step-6: finalize qnn graph and execute qnn graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + input_tensors_0, 2, + output_tensors_0, 1, + nullptr, nullptr)); + + qnn_tensors_t ggml_op_mulmat_tensors; + ggml_op_mulmat_tensors.reserve(5); + ggml_op_mulmat_tensors.push_back(p_tensor0); + ggml_op_mulmat_tensors.push_back(p_tensor1); + ggml_op_mulmat_tensors.push_back(p_tensor2); + ggml_op_mulmat_tensors.push_back(p_param_tensor); + ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); + auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + } else { + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + // this is the second technical approach or another pipeline of "how to utilize the Hexagon + // NPU maximally" through QNN SDK, details could be found at + // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + } + + // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; + op_perf.info(); +} + +void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void 
ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); + GGML_UNUSED(value); +} + +void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + ggml_qnn_dup(ctx, dst); +} + +void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.h b/ggml/src/ggml-qnn/ggml-qnn-ops.h new file mode 100644 index 0000000000000..b1c388a32a87a --- /dev/null +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#pragma once + +#include "ggml-qnn-impl.h" +void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); + +void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); +void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index e862b07a234eb..c830128f750c8 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -13,13 +13,11 @@ * section-5 does ggml-qnn backend helper macro / data structure / function / class * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem * - * currently provide following ggml ops' QNN backend implementation: + * currently provide following ggml ops' QNN backend implementation in ggml-qnn-ops.cpp: * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly * - * of course, can porting ggml-qnn to Windows on ARM as need. - * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the @@ -38,96 +36,23 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if (defined __ANDROID__) || (defined ANDROID) -#include "android/log.h" -#endif - -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnContext.h" -#include "QnnBackend.h" -#include "QnnGraph.h" -#include "QnnProperty.h" -#include "QnnTensor.h" -#include "QnnInterface.h" -#include "Saver/QnnSaver.h" -#include "System/QnnSystemInterface.h" -#include "HTP/QnnHtpDevice.h" -#include "HTP/QnnHtpGraph.h" - -#include "ggml-qnn.h" -#include "ggml-impl.h" -#include "ggml-backend-impl.h" +#include "ggml-qnn-impl.h" +#include "ggml-qnn-ops.h" // ================================================================================================= // section-1: forward/external declaration // ================================================================================================= -class qnn_instance; -struct ggml_backend_qnn_context; -static int free_qnn_tensor(Qnn_Tensor_t * tensor); +static int free_qnn_tensor(Qnn_Tensor_t * tensor); static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); -static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); +typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); // ================================================================================================= // section-2: ggml-qnn internal troubleshooting function // ================================================================================================= -#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend -#define GGML_QNN_LOGBUF_LEN 4096 -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 - -#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if GGMLQNN_DEBUG -#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define GGMLQNN_LOG_DEBUG(...) -#endif -static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { +void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ static std::mutex ggmlqnn_log_internal_mutex; static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + GGML_UNUSED(file); { std::lock_guard lock(ggmlqnn_log_internal_mutex); va_list args; @@ -138,11 +63,11 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const #if (defined __ANDROID__) || (defined ANDROID) //for Android application(standard APP or command line tool) __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); -#endif -#if (defined __ANDROID__) || (defined ANDROID) - //do nothing when running on Snapdragon based Android device + if (GGML_LOG_LEVEL_INFO == level) { + printf("%s\n", s_ggmlqnn_log_internal_buf); + } #else - //for Snapdragon based WoA(Windows on ARM) device + //for Snapdragon based WoA(Windows on ARM) device or Linux printf("%s\n", s_ggmlqnn_log_internal_buf); #endif } @@ -153,16 +78,48 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const // ================================================================================================= // section-3: general helper macro / data structure / function // ================================================================================================= -#define DISABLE_COPY(class_name) \ - class_name(const class_name &) = delete; \ - void operator=(const class_name &) = delete +#if defined(_WIN32) +static const char * last_func = nullptr; +static long last_err; +void * dlopen(const char * dll, int flags) { + HINSTANCE h = LoadLibraryA(dll); + GGML_UNUSED(flags); + if (h == NULL) { + last_err = GetLastError(); + last_func = "dlopen"; + } + return h; +} -#define DISABLE_MOVE(class_name) \ - class_name(class_name &&) = delete; \ - void operator=(class_name &&) = delete +int dlclose(void * h) { + if (!FreeLibrary((HINSTANCE)h)) { + last_err = GetLastError(); + last_func = "dlclose"; + return -1; + } + return 0; +} -#define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) -#define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) +void * dlsym(void * h, const char * name) { + FARPROC p = GetProcAddress((HINSTANCE)h, name); + if (!p) { + last_err = GetLastError(); + last_func = "dlsym"; + } + return (void*)(intptr_t)p; +} + +const char * dlerror(void) { + static char str[512]; + if (!last_err) return nullptr; + + snprintf(str, 512, "%s error #%ld", last_func, last_err); + last_err = 0; + last_func = NULL; + + return str; +} +#endif static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? 
offset @@ -171,62 +128,40 @@ static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { offset % static_cast(alignment)); } -static void * ggmlqnn_mallocz_aligned(size_t size, size_t alignment) { - uint8_t * buffer = NULL; - size_t * sp = NULL; - buffer = static_cast(calloc(1, size + GGMLQNN_MEM_ADD(alignment))); - if (!buffer) - return NULL; - sp = (size_t *)buffer; - *sp = size; - buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); - buffer[-1] = buffer - (uint8_t *)sp; - return buffer; -} - -static void * ggmlqnn_malloc_aligned(size_t size, size_t alignment) { - uint8_t * buffer = NULL; - size_t * sp = NULL; - buffer = static_cast(malloc(size + GGMLQNN_MEM_ADD(alignment))); - if (!buffer) - return NULL; - sp = (size_t *)buffer; - *sp = size; - buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); - buffer[-1] = buffer - (uint8_t *)sp; - return buffer; -} - -static void ggmqnn_free_aligned(void * ptr) { - uint8_t * old = (uint8_t *)ptr; - if (!old) - return; - old -= old[-1]; - free(old); -} - static size_t get_system_total_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) struct sysinfo info = {}; - if (sysinfo(&info) == 0) { + if (0 == sysinfo(&info)) { return (info.totalram + info.totalswap) * info.mem_unit; } - auto pages = (size_t)sysconf(_SC_PHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return pages * page_size; +#elif defined(_WIN32) + //TODO: Snapdragon based WoA(Windows on ARM) + return 0; +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif } static size_t get_system_free_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) struct sysinfo info = {}; - if (sysinfo(&info) == 0) { + if (0 == sysinfo(&info)) { return (info.freeram + info.freeswap) * info.mem_unit; } - auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return avail_pages * page_size; +#elif defined(_WIN32) + //TODO: Snapdragon based WoA(Windows on ARM) + return 0; +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif } static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { @@ -241,16 +176,23 @@ static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, siz } static char * ggmlqnn_strndup(const char * source, size_t maxlen) { - return ::strndup(source, maxlen); + return strndup(source, maxlen); } static void * ggmlqnn_host_malloc(size_t n) { - void * data = NULL; - int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); +#if defined(__ANDROID__) || defined(__linux__) + void * data = nullptr; + int result = posix_memalign((void **)&data, sysconf(_SC_PAGESIZE), n); if (result != 0) { GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); - return NULL; + return nullptr; } +#elif defined(_WIN32) + //TODO: Snapdragon based WoA(Windows on ARM) + return nullptr; +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif return data; } @@ -258,57 +200,6 @@ static void * ggmlqnn_host_malloc(size_t n) { // ================================================================================================= // section-4: QNN helper macro / data structure / function // ================================================================================================= -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - GGMLQNN_LOG_WARN("%s expected 
QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) - -#define CHECK_QNN_API(error, result) \ - do { \ - error = (result); \ - if (QNN_SUCCESS != error) { \ - if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ - GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ - } else { \ - GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, qnn_get_error_string(error)); \ - } \ - } \ - } while (0) - -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) - -#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) - -#define QNN_VER_PTR(x) (&((x).v1)) -#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) - -#define QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) -#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) -#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) -#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) -#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) -#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) -#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) -#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) -#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) - -#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) -#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) -#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) - -#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ - set_qnn_op_config_params(op_config, num_of_params, params) - -#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ - set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) - -#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ - set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) - #define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) #define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) @@ -333,200 +224,6 @@ static void * ggmlqnn_host_malloc(size_t n) { #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) -static inline int validate_tensor_version(Qnn_Tensor_t tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - GGMLQNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, - tensor.version); - return 1; - } - return 0; -} - -[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { - if (op_config.version != QNN_OPCONFIG_VERSION_1) { - GGMLQNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", - op_config.v1.name, - op_config.version); - return 1; - } - return 0; -} - -static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.name; - } - return nullptr; -} - -[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { - return get_qnn_oponfig_name(*op_config); -} - -static inline const 
char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.packageName; - } - return nullptr; -} - -[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_packagename(*op_config); -} - -static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.typeName; - } - return nullptr; -} - -[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_typename(*op_config); -} - -static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfParams; - } - return 0u; -} - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numparams(*op_config); -} - -static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.params; - } - return nullptr; -} - -[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_params(*op_config); -} - -static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfInputs; - } - return 0u; -} - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numinputs(*op_config); -} - -static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.inputTensors; - } - return nullptr; -} - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_inputs(*op_config); -} - -static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfOutputs; - } - return 0u; -} - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numoutputs(*op_config); -} - -static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.outputTensors; - } - return nullptr; -} - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_outputs(*op_config); -} - -static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.name = name; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { - set_qnn_op_config_name(*op_config, name); -} - -static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.packageName = package_name; - } -} - -[[maybe_unused]] static inline void 
set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { - set_qnn_op_config_packagename(*op_config, package_name); -} - -static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.typeName = type_name; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { - set_qnn_op_config_typename(*op_config, type_name); -} - -static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfParams = num_of_params; - op_config.v1.params = params; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - set_qnn_op_config_params(*op_config, num_of_params, params); -} - -static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfInputs = num_of_inputs; - op_config.v1.inputTensors = input_tensors; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); -} - -static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfOutputs = num_of_outputs; - op_config.v1.outputTensors = output_tensors; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); -} - static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.id; @@ -535,10 +232,6 @@ static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { return 0u; } -[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorid(*tensor); -} - static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.name; @@ -546,10 +239,6 @@ static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { return nullptr; } -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorname(*tensor); -} - static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.type; @@ -557,10 +246,6 @@ static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { return QNN_TENSOR_TYPE_UNDEFINED; } -[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensortype(*tensor); -} - static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataFormat; @@ -568,10 +253,6 @@ static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_ return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } -[[maybe_unused]] 
static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dataformat(*tensor); -} - static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataType; @@ -579,10 +260,6 @@ static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor return QNN_DATATYPE_UNDEFINED; } -[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_datatype(*tensor); -} - static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.quantizeParams; @@ -590,10 +267,6 @@ static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t return QNN_QUANTIZE_PARAMS_INIT; } -[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_quantparams(*tensor); -} - static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.rank; @@ -601,10 +274,6 @@ static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { return 0u; } -[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_rank(*tensor); -} - static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dimensions; @@ -612,10 +281,6 @@ static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) return nullptr; } -[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dimensions(*tensor); -} - static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.memType; @@ -623,161 +288,78 @@ static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & te return QNN_TENSORMEMTYPE_UNDEFINED; } -[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memtype(*tensor); -} - -static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.clientBuf; - } - return QNN_CLIENT_BUFFER_INIT; -} - -[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_clientbuf(*tensor); -} - -static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memHandle; - } - return nullptr; -} - -[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memhandle(*tensor); -} - static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; } } -[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { - set_qnn_tensor_id(*tensor, id); -} - static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.name = name; } } -[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const 
char * name) { - set_qnn_tensor_name(*tensor, name); -} - static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; } } -[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { - set_qnn_tensor_type(*tensor, type); -} - static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataFormat = format; } } -[[maybe_unused]] static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { - set_qnn_tensor_dataformat(*tensor, format); -} - static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataType = dataType; } } -[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { - set_qnn_tensor_datatype(*tensor, dataType); -} - static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.quantizeParams = params; } } -[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { - set_qnn_tensor_quantparams(*tensor, params); -} - static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.rank = rank; } } -[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { - set_qnn_tensor_rank(*tensor, rank); -} - static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dimensions = dims; } } -[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { - set_qnn_tensor_dimensions(*tensor, dims); -} - static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memType = memType; } } -[[maybe_unused]] static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { - set_qnn_tensor_memtype(*tensor, memType); -} - static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.clientBuf = clientBuf; } } -[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { - set_qnn_tensor_clientbuf(*tensor, clientBuf); -} - static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memHandle = handle; } } -[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { - set_qnn_tensor_memhandle(*tensor, handle); -} - -inline static Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { - Qnn_Tensor_t tensor; - tensor.version = version; - if (version == QNN_TENSOR_VERSION_1) { - tensor.v1 = QNN_TENSOR_V1_INIT; - } else if (version == QNN_TENSOR_VERSION_2) { - tensor.v2 = QNN_TENSOR_V2_INIT; - } - return tensor; -} - static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; - VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; - QNN_TENSOR_SET_NAME( - dst, 
ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + QNN_TENSOR_SET_NAME(dst, ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); @@ -796,20 +378,20 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { } Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; + Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; size_t scale_offset_size = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); + *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); ggmlqnn_memscpy(*scale_offset, scale_offset_size, src_qparam.axisScaleOffsetEncoding.scaleOffset, scale_offset_size); QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); float ** scales = &bwaxis_scale_offset.scales; @@ -831,7 +413,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { QNN_TENSOR_SET_RANK(dst, rank); size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); uint32_t * dimensions = (uint32_t *)malloc(dim_size); - if (dimensions == nullptr) { + if (nullptr == dimensions) { GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); return 1; } @@ -843,10 +425,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { static int free_qnn_tensor(Qnn_Tensor_t * tensor) { int err = 0; - VALIDATE_TENSOR_VERSION(*tensor, err); free((void *) QNN_TENSOR_GET_NAME(*tensor)); - - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { free(src_qparam.axisScaleOffsetEncoding.scaleOffset); @@ -862,55 +442,7 @@ static int free_qnn_tensor(Qnn_Tensor_t * tensor) { return err; } - -static size_t qnn_datatype_size(Qnn_DataType_t qnn_type) { - switch (qnn_type) { - case QNN_DATATYPE_FLOAT_32: - return sizeof(float); - case QNN_DATATYPE_FLOAT_16: - return sizeof(uint16_t); - case QNN_DATATYPE_UINT_32: - case QNN_DATATYPE_INT_32: - return sizeof(int32_t); - case QNN_DATATYPE_INT_16: - return sizeof(int16_t); - case QNN_DATATYPE_INT_8: - return sizeof(int8_t); - case QNN_DATATYPE_SFIXED_POINT_8: - return sizeof(int8_t); - case QNN_DATATYPE_SFIXED_POINT_4: - return sizeof(int8_t); - default: - break; - } - return 0; -} - -static const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type) { - switch 
(qnn_type) { - case QNN_DATATYPE_FLOAT_32: - return "QNN_DATATYPE_FLOAT_32"; - case QNN_DATATYPE_FLOAT_16: - return "QNN_DATATYPE_FLOAT_16"; - case QNN_DATATYPE_UINT_32: - return "QNN_DATATYPE_UINT_32"; - case QNN_DATATYPE_INT_32: - return "QNN_DATATYPE_INT_32"; - case QNN_DATATYPE_INT_16: - return "QNN_DATATYPE_INT_16"; - case QNN_DATATYPE_INT_8: - return "QNN_DATATYPE_INT_8"; - case QNN_DATATYPE_SFIXED_POINT_8: - return "QNN_DATATYPE_SFIXED_POINT_8"; - case QNN_DATATYPE_SFIXED_POINT_4: - return "QNN_DATATYPE_SFIXED_POINT_4"; - default: - break; - } - return "QNN_DATATYPE_UNDEFINED"; -} - -static const char * qnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { +const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html switch (qnn_error_code) { case QNN_SUCCESS: @@ -1013,59 +545,24 @@ static const char * qnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { } } +// helper function to create an operation config +Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t num_outputs) { + Qnn_OpConfigV1_t v1 = {name, package, type, + num_params, params, + num_inputs, inputs, + num_outputs, outputs + }; + Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; + + return opcfg; +} + // ================================================================================================= // section-5:ggml-qnn backend helper macro / data structure / function / class // ================================================================================================= -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - -typedef void (* ggmlqnn_op_func_t)(ggml_backend_t backend, ggml_tensor * op); - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - -using qnn_res_t = std::tuple>; -using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; - -enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; - -enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - V79 = 79, -}; - -enum qcom_chipset_soc_model { - UNKNOWN_SM = 0, - SM7450 = 41, // v69, 7 Gen1 - SM8350 = 30, // v68, 888 - SM8450 = 36, // v69, SD 8 Gen 1 - SM8475 = 42, // v69, SD 8+ Gen 1 - SM8550 = 43, // v73, SD 8 Gen 2 - SM8650 = 57, // v75, SD 8 Gen 3 - SM8750 = 69, // v79, SD 8 Gen 4 -}; - -struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; - char soc_desc[GGML_MAX_NAME]; -}; - //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices static struct qcom_socinfo g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 7 Gen 1 */ @@ -1117,20 +614,30 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, -}; +#if defined(_WIN32) + /* Qualcomm SnapDragon 7c Gen 2 */ + [SC7280X] = { + .soc_model = SC7280X, + .htp_arch = V68, + .vtcm_size_in_mb = 
8, + .soc_desc = "Qualcomm SnapDragon 7c Gen 2"}, -struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char desc[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - struct qcom_socinfo socinfo; -} ; + /* Qualcomm SnapDragon 8cx Gen 3 */ + [SC8280X] = { + .soc_model = SC8280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 3"}, + + /* Qualcomm SnapDragon 8cx Gen 4 */ + [SC8380XP] = { + .soc_model = SC8380XP, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 4"}, +#endif + +}; //the following helper funcs are used to ensure every QNN tensor name is unique static std::atomic g_ggmltensor_idx(0); @@ -1157,7 +664,11 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-cpu", .desc = "Qualcomm Kryo CPU", +#if defined(_WIN32) + .lib = "QnnCpu.dll", +#else .lib = "libQnnCpu.so", +#endif .instance = nullptr, .backend = nullptr, .raw_interface = {}, @@ -1168,7 +679,11 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-gpu", .desc = "Qualcomm Adreno GPU", +#if defined(_WIN32) + .lib = "QnnGpu.dll", +#else .lib = "libQnnGpu.so", +#endif .instance = nullptr, .backend = nullptr, .raw_interface = {}, @@ -1179,7 +694,11 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-npu", .desc = "Qualcomm NPU(Hexagon Tensor Processor)", +#if defined(_WIN32) + .lib = "QnnHtp.dll", +#else .lib = "libQnnHtp.so", +#endif .instance = nullptr, .backend = nullptr, .raw_interface = {}, @@ -1187,13 +706,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, }; -struct qnn_op_caps_t { - const char * qnn_op_name = nullptr; - const size_t input_param_count = 0; - const char * qnn_param_name = nullptr; -}; - -static const qnn_op_caps_t k_op_caps[] = { +const qnn_op_caps_t ggmlqnn_k_op_caps[] = { {}, // GGML_OP_NONE {}, // GGML_OP_DUP { @@ -1353,54 +866,6 @@ static struct qcom_socinfo * qnn_get_socinfo_from_socmodel(uint32_t soc_model) { return nullptr; } -static bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - qnn_instance * instance = ctx->instance; - if (nullptr == instance) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - return true; -} - -#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - -static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { - /* - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; - */ - return ggml_n_dims(tensor); -} - -static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = ggml_get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} static const char * 
ggml_get_type_name(ggml_type type) { const struct ggml_type_traits * traits = ggml_get_type_traits(type); @@ -1412,9 +877,8 @@ static const char * get_ggml_type_name(ggml_type type) { return traits->type_name; } -//TODO: // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { +Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { switch (ggmltype) { case GGML_TYPE_F16: return QNN_DATATYPE_FLOAT_16; @@ -1432,7 +896,6 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_UNDEFINED; } -//TODO: static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { switch (qnn_type) { case QNN_DATATYPE_FLOAT_32: @@ -1456,23 +919,32 @@ static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { return GGML_TYPE_COUNT; } -//TODO: add more ops -static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; +static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) { + if (rank > GGML_MAX_DIMS) { + GGMLQNN_LOG_WARN("invalid params"); + return; + } + if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { + GGMLQNN_LOG_WARN("invalid params"); + return; + } + for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) + qnn_dimensions[idx] = ggml_dimensions[idx]; + + if (rank >= 2) { + qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; + qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; } - return nullptr; } -static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - char tensor_name[GGML_MAX_NAME] = {0}; +Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {}; //ensure the tensor name is unique if (nullptr != name) { @@ -1483,19 +955,36 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); inc_idx(); - uint32_t dimensions_transpose[GGML_MAX_DIMS] = {}; - uint32_t * tensor_dims = nullptr; + uint32_t reverse_dims[GGML_MAX_DIMS] = {}; + uint32_t transpose_dims[GGML_MAX_DIMS] = {}; + uint32_t * tensor_dims = nullptr; + //case 1:use dims info from ggml tensor if (nullptr != tensor) { //there are different dimension order between ggml tensor and qnn tensor for (size_t idx = 0; idx < rank; idx++) { - dimensions_transpose[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; } - tensor_dims = dimensions_transpose; + tensor_dims = reverse_dims; } - //re-assign tensor_dims + //case 2: use user's specified tensor_dims if (nullptr != dims) { tensor_dims = dims; } + //case 3: transpose for dst tensor + if (b_transpose) { + GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case + + get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, 
ggml_n_dims(tensor)); + tensor_dims = transpose_dims; +#if 0 + for (size_t idx = 0; idx < 4; idx++) { + GGMLQNN_LOG_DEBUG("origin dim[%d]=%d\n", idx, reverse_dims[idx]); + } + for (size_t idx = 0; idx < 4; idx++) { + GGMLQNN_LOG_DEBUG("trans dim[%d]=%d\n", idx, transpose_dims[idx]); + } +#endif + } Qnn_Tensor_t qnn_tensor = { .version= QNN_TENSOR_VERSION_1, @@ -1505,14 +994,13 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, .type = qnn_tensor_type, .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, .dataType = qnn_data_type, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, + .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, + .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, .rank = rank, .dimensions = tensor_dims, .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {nullptr, 0} - } + .clientBuf = {.data = nullptr, .dataSize = 0} } } }; @@ -1526,545 +1014,166 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, } error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); if (error != QNN_SUCCESS) { - free(p_qnn_tensor); - GGMLQNN_LOG_WARN("init tensor failed"); - return nullptr; - } - QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; - - return p_qnn_tensor; -} - -static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) { - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], - (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - } - - qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); - Qnn_Tensor_t * p_qnn_tensor = ggml_qnn_create_general_tensor(tensor, nullptr, - qnn_tensor_type, qnn_data_type, - ggml_n_dims(tensor), dimensions, - nullptr, 0); - - return p_qnn_tensor; -} - -static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { - char buffer[256] = {}; - const char * type_name = get_ggml_type_name(tensor->type); - int len = 0; - switch (ggml_n_dims(tensor)) { - case 1: - len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name); - break; - case 2: - len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); - break; - case 3: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], type_name); - break; - case 4: - default: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3], type_name); - break; - } - GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); - output.append(buffer, len); -} - -static size_t get_qnn_op_index(const ggml_tensor * tensor) { - if (tensor->op == GGML_OP_UNARY) { - return GGML_OP_COUNT + ggml_get_unary_op(tensor); - } - - return tensor->op; -} - -static size_t get_qnn_op_input_param_count(const ggml_tensor * op) { - auto op_index = get_qnn_op_index(op); - GGML_ASSERT(op_index < std::size(k_op_caps)); - return k_op_caps[op_index].input_param_count; -} - -static void get_graph_key_from_op(const ggml_tensor * op, std::string & 
output) { - GGML_ASSERT(op->op != GGML_OP_NONE); - output += ggml_op_desc(op); - output += get_ggml_type_name(op->type); - size_t param_count = get_qnn_op_input_param_count(op); - for (size_t i = 0; i < param_count; ++i) { - auto * input = op->src[i]; - if (!input) { - break; - } - output += '_'; - append_tensor_dimensions(input, output); - } -} - -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() { - _begin_time = ggml_time_us(); - } - - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } - -private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; -}; -#else -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) {} - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() {} - void info() {} -}; -#endif - -template -Fn load_qnn_functionpointers(void * handle, const char * function_name) { - return reinterpret_cast(dlsym(handle, function_name)); -} - -class qnn_interface { - -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - friend class qnn_instance; - -public: - qnn_interface() = default; - - // QnnBackend - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); - - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); - - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); - - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); - - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, 
logFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); - - // QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); - - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); - - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); - - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); - - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); - - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); - - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - - void set_qnn_interface(const QnnInterface_t * qnn_interface) { - _qnn_interface = qnn_interface; - } - - void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } - - uint32_t get_backend_id() const { - return _qnn_interface->backendId; - } - - bool is_loaded() const { - return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); - } - -private: - const QnnInterface_t *_qnn_interface = nullptr; - - const QnnSystemInterface_t *_qnn_sys_interface = nullptr; -}; - -class qnn_instance { -public: - using BackendIdType = decltype(QnnInterface_t{}.backendId); - - explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, - const std::string & model_name) : - _lib_path(std::move(lib_path)), - _backend_name(std::move(backend_name)), - _model_name(std::move(model_name)) {}; - - ~qnn_instance() { - } - - int qnn_init(const QnnSaver_Config_t ** saver_config); - - int qnn_finalize(); - - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_interface; - } - - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } - - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_system_interface; - } - - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - - const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - - const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - - const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - - const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - - const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - - int init_qnn_graph(const char * graph_name, - bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr - ); - int 
init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); - - int finalize_qnn_graph(); - - bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } - - int init_htp_perfinfra() { - QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to get qnn device infra\n"); - return 1; - } - - QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; - uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; - htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - _qnn_htp_perfinfra = htp_perfinfra; - _qnn_power_configid = power_configid; - - return 0; - } - - int set_rpc_polling() { - if (_qnn_rpc_pollingtime > 0) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; - memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); - rpc_pollingtime.option = - QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; - if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); - } - } - return 0; - } - - int set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { - GGMLQNN_LOG_DEBUG("perf intra is null\n"); - return 1; - } - - QnnHtpPerfInfrastructure_PowerConfig_t power_config; - memset(&power_config, 0, sizeof(power_config)); - power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.contextId = _qnn_power_configid; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False - power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False - power_config.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False - power_config.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable - power_config.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False - // set Sleep latency parameter - uint32_t latencyValue = 40; - power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec - // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr}; - - 
_qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); - - return 0; - } - - std::string & get_qnn_graph_name() { return _graph_name; } - - bool is_rpcmem_initialized() { - return _rpcmem_initialized; - } - - void set_rpcmem_initialized(bool initialized) { - _rpcmem_initialized = initialized; - } - - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - size_t get_rpcmem_usage() { return _rpcmem_usage; } - - int32_t rpcmem_to_fd(void * buf); - - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); - Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); - - void unregister_rpcmem(); - void unregister_rpcmem(Qnn_MemHandle_t mem_handle); - - void * alloc_rpcmem(size_t bytes, size_t alignment); - void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); - - void free_rpcmem(void * buf); - void free_rpcmem(); - - bool is_rpcmem_allocated(void * buf); - - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { - return _qnn_mem_set.count(handle) != 0U; + free(p_qnn_tensor); + GGMLQNN_LOG_WARN("init tensor failed"); + return nullptr; } + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; - bool enable_qnn_rpc() { - return _enable_qnn_rpc; - } + return p_qnn_tensor; +} - void probe_device_meminfo() { - size_t candidate_size = 0; - uint8_t *rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); - if (nullptr == rpc_buffer) { - GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], - strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; +Qnn_Tensor_t * ggmlqnn_create_compute_tensor(const ggml_tensor * tensor) { + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - free_rpcmem(); - _rpcmem_usage = 0; - GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } -public: - std::map>> _qnn_graph_map; - -private: - int load_system(); - - int unload_system(); + qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); + Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(tensor, nullptr, + qnn_tensor_type, qnn_data_type, + ggml_n_dims(tensor), dimensions, + nullptr, 0); - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + return p_qnn_tensor; +} - int unload_backend(); +void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + const enum ggml_type src0_type = src0->type; + + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(nb00 == ggml_type_size(src0_type)); + GGML_ASSERT(nb10 == 
ggml_type_size(src1->type)); + + const int64_t ne_plane = ne01 * ne00; + const size_t desired_size = ((GGML_TYPE_F32 == src0_type) ? 0 : ne03 * ne02 * ne_plane * sizeof(float)); + ctx->desired_size = desired_size; + if (ctx->work_size < desired_size) { + ctx->work_data.reset(new char[desired_size]); + ctx->work_size = desired_size; + } + ctx->n_threads = std::thread::hardware_concurrency(); + void * wdata = ctx->work_data.get(); + // convert src0 to float + if (src0_type != GGML_TYPE_F32) { + const auto * type_traits = ggml_get_type_traits(src0_type); + ggml_to_float_t const to_float = type_traits->to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); + const int n_threads = std::max( + std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; + // wait for all tasks to finish + for (auto &task: ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); } + return wdata; +} - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; +static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { + char buffer[256] = {}; + const char * type_name = get_ggml_type_name(tensor->type); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], type_name); + break; } - - void * alloc_rpcmem_internal(size_t bytes, size_t alignment); - -private: - static constexpr const int _required_num_providers = 1; - -private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // name of prebuilt QNN model, might be used in the future - BackendIdType _backend_id; - - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; - - qnn_interface _qnn_interface; - - 
void * _system_lib_handle = nullptr; - - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - - QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); + output.append(buffer, len); +} - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; +size_t ggmlqnn_get_opcaps_size() { + return std::size(ggmlqnn_k_op_caps); +} - std::unordered_map _qnn_mem_set; - std::unordered_map _qnn_rpc_buffer_to_handles; +size_t ggmlqnn_get_op_index(const ggml_tensor * tensor) { + if (tensor->op == GGML_OP_UNARY) { + return static_cast(GGML_OP_COUNT) + static_cast(ggml_get_unary_op(tensor)); + } - static std::mutex _init_mutex; - static std::unordered_map _loaded_lib_handle; - static std::unordered_map _lib_path_to_backend_id; - static std::unordered_map _loaded_backend; + return tensor->op; +} - void * _rpc_lib_handle = nullptr; - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - std::unordered_map _rpcmem_usage_map; - size_t _rpcmem_capacity = 512; // mempool size in Mbytes - size_t _rpcmem_usage = 0; // mempool usage in Mbytes +static size_t ggmlqnn_get_op_input_param_count(const ggml_tensor * op) { + auto op_index = ggmlqnn_get_op_index(op); + GGML_ASSERT(op_index < std::size(ggmlqnn_k_op_caps)); + return ggmlqnn_k_op_caps[op_index].input_param_count; +} - std::string _graph_name; - QNNBackend _device_id; - bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature +void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output) { + GGML_ASSERT(op->op != GGML_OP_NONE); + output += ggml_op_desc(op); + output += get_ggml_type_name(op->type); + size_t param_count = ggmlqnn_get_op_input_param_count(op); + for (size_t i = 0; i < param_count; ++i) { + auto * input = op->src[i]; + if (!input) { + break; + } + output += '_'; + append_tensor_dimensions(input, output); + } +} - DISABLE_COPY(qnn_instance); - DISABLE_MOVE(qnn_instance); -}; +template +Fn load_qnn_functionpointers(void * handle, const char * function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} std::mutex qnn_instance::_init_mutex; std::unordered_map qnn_instance::_loaded_lib_handle; @@ -2079,13 +1188,13 @@ void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { auto allocate_bytes = static_cast(bytes + alignment); void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); - if (buf == nullptr) { + if (nullptr == buf) { GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); return nullptr; } auto aligned_buf = reinterpret_cast(ggmlqnn_align_to(alignment, - reinterpret_cast(buf))); + reinterpret_cast(buf))); bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); @@ -2141,8 +1250,6 @@ void 
qnn_instance::free_rpcmem(void * buf) { } void qnn_instance::free_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (_rpcmem_store_map.empty()) { GGMLQNN_LOG_WARN("no rpcmem allocated\n"); return; @@ -2184,13 +1291,13 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { GGMLQNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); - return 4; + return 3; } int32_t mem_fd = rpcmem_to_fd(p_data); if (-1 == mem_fd) { GGMLQNN_LOG_WARN("failed to get file descriptor\n"); - return 5; + return 4; } GGMLQNN_LOG_DEBUG("mem_fd %d\n", mem_fd); Qnn_MemDescriptor_t descriptor = { @@ -2206,9 +1313,8 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), - strerror(error)); - return 6; + GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); + return 5; } else { GGMLQNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); } @@ -2247,8 +1353,7 @@ Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t ran {{mem_fd}} }; Qnn_MemHandle_t handle = nullptr; - auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); + auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); return nullptr; @@ -2285,8 +1390,7 @@ void qnn_instance::unregister_rpcmem() { Qnn_MemHandle_t mem_handle = it->second; error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", - QNN_GET_ERROR_CODE(error)); + GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } else { GGMLQNN_LOG_DEBUG("unregister shared memory ok"); } @@ -2324,9 +1428,9 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * return 1; } - auto get_providers = - load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, - "QnnInterface_getProviders"); + auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + lib_handle, + "QnnInterface_getProviders"); if (nullptr == get_providers) { GGMLQNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); return 2; @@ -2386,21 +1490,6 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; -#if 0 // leave them here for further use - QnnSaver_Config_t outputdir_cfg; - outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; - outputdir_cfg.outputDirectory = "/data/local/tmp/"; - QnnSaver_Config_t backendid_cfg; - backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; - backendid_cfg.backendId = _backend_id; - - const QnnSaver_Config_t * saver_cfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; - if (0 == QnnSaver_initialize(saver_cfg)) { - GGMLQNN_LOG_INFO("QnnSaver_initialize successfully"); - } else { - GGMLQNN_LOG_WARN("QnnSaver_initialize failure"); - } -#endif auto saver_initialize = 
load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( _loaded_lib_handle[backend_id], "QnnSaver_initialize"); @@ -2419,7 +1508,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * int qnn_instance::unload_backend() { int dlclose_error = 0; - for (auto &it : _loaded_lib_handle) { + for (auto & it : _loaded_lib_handle) { dlclose_error = dlclose(it.second); if (dlclose_error != 0) { GGMLQNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); @@ -2436,7 +1525,11 @@ int qnn_instance::unload_backend() { int qnn_instance::load_system() { Qnn_ErrorHandle_t error = QNN_SUCCESS; +#ifdef _WIN32 + std::string system_lib_path = _lib_path + "QnnSystem.dll"; +#else std::string system_lib_path = _lib_path + "libQnnSystem.so"; +#endif GGMLQNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); @@ -2444,7 +1537,11 @@ int qnn_instance::load_system() { GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); //re-try with default path of QNN binary runtime lib _lib_path = "/data/local/tmp/"; +#ifdef _WIN32 + system_lib_path = _lib_path + "QnnSystem.dll"; +#else system_lib_path = _lib_path + "libQnnSystem.so"; +#endif _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _system_lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); @@ -2572,7 +1669,7 @@ static void ggml_qnn_logcallback(const char * fmt, std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - GGMLQNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + GGMLQNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } } #else @@ -2580,6 +1677,10 @@ static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { + GGML_UNUSED(fmt); + GGML_UNUSED(level); + GGML_UNUSED(timestamp); + GGML_UNUSED(argp); } #endif @@ -2594,20 +1695,20 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { GGMLQNN_LOG_DEBUG("load QNN system lib successfully\n"); } - std::string bakend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { - int is_load_ok = load_backend(bakend_lib_path, saver_config); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); if (0 != is_load_ok) { GGMLQNN_LOG_WARN("failed to load QNN backend\n"); return 2; } } - backend_id = _lib_path_to_backend_id[bakend_lib_path]; + backend_id = _lib_path_to_backend_id[backend_lib_path]; if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { GGMLQNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", - bakend_lib_path.c_str(), + backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); return 3; @@ -2661,7 +1762,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; + return 6; } else { 
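        // The rpc-memory setup a few lines below resolves Qualcomm's rpcmem allocator the same
        // way this file loads every other QNN library: dlopen the shared object, then dlsym each
        // entry point into the pfn_rpc_mem_* function pointers. A minimal, hedged sketch of that
        // pattern (assuming the conventional libcdsprpc entry-point names rpcmem_alloc /
        // rpcmem_free, whose signatures match the pfn_rpc_mem_* typedefs in this backend, and the
        // RPCMEM_HEAP_ID_SYSTEM / RPCMEM_DEFAULT_FLAGS values it defines):
        //
        //     void * h = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL);   // "libcdsprpc.dll" on WoA
        //     if (nullptr != h) {
        //         auto rpc_alloc = reinterpret_cast<void * (*)(int, uint32_t, int)>(dlsym(h, "rpcmem_alloc"));
        //         auto rpc_free  = reinterpret_cast<void   (*)(void *)>(dlsym(h, "rpcmem_free"));
        //         if (rpc_alloc && rpc_free) {
        //             void * buf = rpc_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, 4096);
        //             rpc_free(buf);
        //         }
        //         dlclose(h);
        //     }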
GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); } @@ -2677,10 +1778,16 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } +#if defined(__ANDROID__) || defined(__linux__) _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); +#elif defined(_WIN32) + _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif if (nullptr == _rpc_lib_handle) { GGMLQNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 9; + return 8; } else { GGMLQNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); @@ -2694,7 +1801,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { || nullptr == _pfn_rpc_mem_to_fd) { GGMLQNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); - return 10; + return 9; } if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy @@ -2706,7 +1813,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { &_qnn_context_handle); if (nullptr == _qnn_context_handle) { GGMLQNN_LOG_WARN("why failed to initialize qnn context, error:%s\n", strerror(errno)); - return 8; + return 10; } else { GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n"); } @@ -2716,7 +1823,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); GGMLQNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { + for (size_t i = 0; i < p_info->v1.numHwDevices; i++) { GGMLQNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, infos[i].v1.numCores); QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; @@ -2728,7 +1835,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { chipinfo.socModel, qnn_get_socmodel_desc(chipinfo.socModel), \ htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); struct qcom_socinfo * socinfo = qnn_get_socinfo_from_socmodel(chipinfo.socModel); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; if (nullptr != socinfo) { memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); GGMLQNN_LOG_INFO("soc info:%s", socinfo->soc_desc); @@ -2881,17 +1988,17 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi if (error != QNN_SUCCESS) { GGMLQNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", ggml_backend_qnn_get_devname(device), graph_name.c_str(), - qnn_get_error_string(error)); + ggmlqnn_get_error_string(error)); return error; } - GGMLQNN_LOG_INFO("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); + GGMLQNN_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); _qnn_graph_handle = graph_handle; return QNN_SUCCESS; } int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { + const QnnGraph_Config_t ** graph_configs) { int result = 0; if (nullptr == graph_name) { @@ -2941,7 +2048,103 @@ int qnn_instance::finalize_qnn_graph() { return 0; } -static uint8_t * 
create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { +int qnn_instance::init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; +} + +int qnn_instance::set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; + memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); + rpc_pollingtime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + } + } + return 0; +} + +int qnn_instance::set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + GGMLQNN_LOG_DEBUG("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + power_config.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + power_config.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + power_config.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, 
power_configs); + + return 0; +} + +void qnn_instance::probe_device_meminfo() { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + + free_rpcmem(); + _rpcmem_usage = 0; + GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); +} + +uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { GGMLQNN_LOG_WARN("invalid params\n"); return nullptr; @@ -2960,7 +2163,7 @@ static uint8_t * create_rpc_buffer(qnn_instance * instance, const ggml_tensor * return qnn_rpcbuffer; } -static void print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { //skip sanity check of params if (nullptr != func_name && nullptr != ctx) { GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); @@ -2986,45 +2189,30 @@ static void dump_op_info(const struct ggml_tensor * tensor) { struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * dst = const_cast(tensor); GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); - print_tensors_info(nullptr, nullptr, src0, src1, dst); -} - -//TODO: currently only support offloading 2D matrix to QNN backend -static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, uint32_t * ggml_dimensions, uint32_t rank) { - if (rank > GGML_MAX_DIMS) { - GGMLQNN_LOG_WARN("invalid params"); - return; - } - if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { - GGMLQNN_LOG_WARN("invalid params"); - return; - } - for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) - qnn_dimensions[idx] = ggml_dimensions[idx]; - - if (rank >= 2) { - qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; - qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; - } + ggmlqnn_print_tensors_info(nullptr, nullptr, src0, src1, dst); } // ================================================================================================= // section-6: implementation of ggml-qnn backend // ================================================================================================= //TODO: refine this function as it is a performance hotspot/bottleneck function -static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { +static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor) { if (tensor->op == GGML_OP_NONE) { return true; } if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE - || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW - || tensor->op == GGML_OP_PERMUTE) { + || tensor->op == GGML_OP_TRANSPOSE + || tensor->op 
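// Illustrative sketch, not part of the patch: the probe pattern used by
// probe_device_meminfo() above, reduced to a standalone helper -- try a list of
// candidate sizes in ascending order and keep the largest one that can actually be
// allocated. malloc/free stand in for alloc_rpcmem_internal()/free_rpcmem().
#include <cstddef>
#include <cstdlib>

static size_t probe_largest_alloc_mb(const size_t * slots_mb, size_t count) {
    const size_t MB = 1u << 20;
    size_t best = 0;
    for (size_t i = 0; i < count; i++) {
        void * p = malloc(slots_mb[i] * MB);   // stand-in for alloc_rpcmem_internal(size, 4)
        if (nullptr == p) {
            break;                             // larger candidates would fail as well
        }
        best = slots_mb[i];
        free(p);                               // stand-in for free_rpcmem(p)
    }
    return best;
}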
== GGML_OP_VIEW + || tensor->op == GGML_OP_PERMUTE + ) { return false; } - //TODO: support other op - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT) - || (tensor->op == GGML_OP_MUL)); + //TODO: add other op here + bool supported_op = ((tensor->op == GGML_OP_ADD) + || (tensor->op == GGML_OP_MUL_MAT) + || (tensor->op == GGML_OP_MUL) + ); if (!supported_op) { return false; } @@ -3032,20 +2220,25 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; - const int64_t ne00 = tensor->src[0]->ne[0]; - const int64_t ne01 = tensor->src[0]->ne[1]; + const int64_t ne00 = tensor->src[0]->ne[0]; + const int64_t ne01 = tensor->src[0]->ne[1]; - const int64_t ne10 = tensor->src[1]->ne[0]; - const int64_t ne11 = tensor->src[1]->ne[1]; + const int64_t ne10 = tensor->src[1]->ne[0]; + const int64_t ne11 = tensor->src[1]->ne[1]; - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; - const uint32_t src0_rank = ggml_get_tensor_rank(src0); - const uint32_t src1_rank = ggml_get_tensor_rank(src1); + const uint32_t src0_rank = ggml_n_dims(src0); + const uint32_t src1_rank = ggml_n_dims(src1); + GGML_UNUSED(ne01); + GGML_UNUSED(ne10); + GGML_UNUSED(ne11); + GGML_UNUSED(ne0); + GGML_UNUSED(ne1); if (tensor->op == GGML_OP_ADD) { - //dump_tensors_info(tensor); + //dump_op_info(tensor); if (!ggml_are_same_shape(src0, src1)) { return false; } @@ -3056,27 +2249,31 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { } if (tensor->op == GGML_OP_MUL_MAT) { - dump_op_info(tensor); + //dump_op_info(tensor); if (src0_rank != src1_rank) // make QNN SDK happy return false; - if (src0_rank < 2) // make QNN SDK happy + if (src0_rank < 2) // QNN's limitation, make QNN SDK happy return false; - if (src0_rank > 3) //TODO: 4D matrix + if (4 == src0_rank) //TODO: 4D matrix mulmat in CT return false; if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy return false; - //TODO: support more data type in func ggml_qnn_mul_mat(...) - //src0: q4_0, q6_k, ... 
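// Illustrative sketch, not part of the patch: the shape constraints that the
// GGML_OP_MUL_MAT branch of ggml_qnn_can_handle_op enforces above, written as a
// standalone predicate over plain ranks/dims. The helper name is hypothetical.
#include <cstdint>

static bool mulmat_shape_ok(uint32_t src0_rank, uint32_t src1_rank,
                            const int64_t src0_ne[4], const int64_t src1_ne[4]) {
    if (src0_rank != src1_rank) return false;  // QNN SDK wants equal ranks
    if (src0_rank < 2)          return false;  // QNN MatMul needs rank >= 2
    if (src0_rank == 4)         return false;  // 4D mulmat is still a TODO in this patch
    // broadcasting over the two outer dims is not offloaded either
    return (src1_ne[2] == src0_ne[2]) && (src1_ne[3] == src0_ne[3]);
}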
- //src1: f32 - //dst : f32 - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); + if (ctx->device == QNN_BACKEND_NPU) + if (2 == src0_rank) + return (src0->type == GGML_TYPE_F32 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K + ) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); + else + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); + else + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) + && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); } if (tensor->op == GGML_OP_MUL) { - //dump_tensors_info(tensor); + //dump_op_info(tensor); if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix return false; return (src0->type == GGML_TYPE_F32) @@ -3084,483 +2281,135 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { && (tensor->type == src1->type); } - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); -} - -/* - * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input - * tensor and 1 output tensor -*/ -static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - enum ggml_status result = GGML_STATUS_SUCCESS; - bool graph_initialized = false; - qnn_instance * instance = nullptr; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - size_t qnn_op_index = get_qnn_op_index(op); - GGML_ASSERT(qnn_op_index < std::size(k_op_caps)); - const char * qnn_op_name = k_op_caps[qnn_op_index].qnn_op_name; - std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); - const char * ggml_op_name = ggml_op_name_string.c_str(); - - qnn_perf op_perf = qnn_perf(ggml_op_name); - op_perf.start(); - - std::string graph_name; - get_graph_key_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensor = std::get<1>(graph_item); - p_tensor0 = tensor[0]; - p_tensor1 = tensor[1]; - p_tensor2 = tensor[2]; - } else { - p_tensor0 = ggml_qnn_create_compute_tensor(src0); - p_tensor1 = ggml_qnn_create_compute_tensor(src1); - p_tensor2 = ggml_qnn_create_compute_tensor(dst); - } - print_tensors_info(__func__, ctx, src0, src1, dst); - - //ensure QNN tensor has correct tensor type - QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - - //save the original dimensions of qnn tensors - uint32_t * 
tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; - - bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; - - if (!graph_initialized) { - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); - return; - } - graph_handle = instance->get_qnn_graph_handle(); - - if (enable_npu_rpc) { - QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor0)->clientBuf = {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*p_tensor1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor1)->clientBuf = {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*p_tensor2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor2)->clientBuf = {.data=nullptr, .dataSize=0}; - } - - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - - if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, p_tensor0, true); - uint8_t * qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, p_tensor1, true); - uint8_t * qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, p_tensor2, false); - if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { - GGMLQNN_LOG_INFO("create rpc buffer failure\n"); - //TODO: potential memory leak although it shouldn't happen - return; - } - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - } - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - Qnn_OpConfig_t op_config = { - QNN_OPCONFIG_VERSION_1, .v1 = { - ggml_op_name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - - if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); - if (nullptr != qnn_rpcbuffer) { - memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); - } - } - - qnn_tensors_t ggml_op_add_tensors; - ggml_op_add_tensors.reserve(3); - ggml_op_add_tensors.push_back(p_tensor0); - ggml_op_add_tensors.push_back(p_tensor1); - ggml_op_add_tensors.push_back(p_tensor2); - - auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; - - } else { - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = 
QNN_DATATYPE_FLOAT_32; - - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*p_tensor0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; - - QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*p_tensor1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; - - QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; - QNN_VER_PTR(*p_tensor2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type; - - if (enable_npu_rpc) { - //TODO: NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); - if (nullptr != qnn_buffer_0) { - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - } - - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); - if (nullptr != qnn_buffer_1) { - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - } - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - - if (enable_npu_rpc) { - //TODO:NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - if (nullptr != qnn_buffer_2) { - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - } - } - - // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; - op_perf.info(); -} - -/* - * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated - * than ggml_qnn_general_node. - * matrix transpose and type trait are required for offload mulmat to QNN backend, - * so it's a standalone function. accordingly, this is another typical skeleton for offload other - * ggml ops to QNN backend - * - * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, should focus on MUL_MAT. 
- * - * have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend - * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1 -*/ -static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); - qnn_instance * instance = nullptr; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Tensor_t * p_param_tensor = nullptr; - Qnn_Tensor_t * p_tensor2_transpose = nullptr; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); - - uint32_t src0_rank = ggml_get_tensor_rank(src0); - uint32_t src1_rank = ggml_get_tensor_rank(src1); - GGML_ASSERT(src0_rank == src1_rank); - GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation - - std::string graph_name; - get_graph_key_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_tensor1 = tensors[1]; - p_tensor2 = tensors[2]; - p_param_tensor = tensors[3]; - p_tensor2_transpose = tensors[4]; - } else { - p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - } - - print_tensors_info(__func__, ctx, src0, src1, dst); - - //ensure QNN tensor has correct tensor type - QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - - //save the original dimensions of qnn tensors - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; - - if (!graph_initialized) { - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - /* - there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn - 1. transpose - a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: - struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); - which like this: - +---+---+ - | 0 | 1 | - +---+---+ - | 2 | 3 | - +---+---+ - | 4 | 5 | - +---+---+ - with - ne[0] = 2 - ne[1] = 3 - there are different dimension order between ggml tensor and qnn tensor - - 2. 
QNN's MatMul can only support input tensors with rank >= 2 - - there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend. - */ - - //step-1: create qnn graph - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), nullptr, &graph_handle); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); - return; - } - //step-2: create param tensor for mulmat of 2d matrix - const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { - {0}, - {1, 0}, - {0, 2, 1}, - {0, 1, 3, 2}, - }; - uint32_t param_tensor_dims[1] = {src0_rank}; - p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, - 1, param_tensor_dims, - (void *) (param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); - - //step-3: create compute tensor from ggml tensor - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - //step-4: create a transpose tensor - uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; - p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank(dst)); - //save pointer because the dimensions of tensor p_tensor2_transpose will be changed later - uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions; - //update dimensions of tensor p_tensor2_transpose to make QNN SDK happy - QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_transpose_dims; - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); - - //step-5: compose qnn graph: add mat_mul node - Qnn_Param_t out_0_params[] = { - {QNN_PARAMTYPE_SCALAR, - QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, - .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} - } - }; - - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; - Qnn_OpConfig_t out_0 = { - QNN_OPCONFIG_VERSION_1, .v1 = - {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - 1, - out_0_params, - 2, - out_0_inputs, - 1, - out_0_outputs} - }; - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); - - //step-5: compose qnn graph: add transpose node - Qnn_Param_t out_trans1_0_params[] = { - {(Qnn_ParamType_t) 1, - "perm", .tensorParam = *p_param_tensor - } - }; - Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; - Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; - Qnn_OpConfig_t out_trans1_0 = { - QNN_OPCONFIG_VERSION_1, - .v1 = {"ggmlqnn_mulmat_transpose_opconfig", - "qti.aisw", - QNN_OP_TRANSPOSE, 1, - out_trans1_0_params, - 1, - out_trans1_0_inputs, - 1, - out_trans1_0_outputs} - }; - 
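// Illustrative sketch, not part of the patch: how a 2D ggml ne[] array maps to QNN
// dimensions with the last two axes swapped, matching the transpose discussion above
// and the (removed) get_qnn_dimensions_from_ggml_dimensions() helper. ggml stores
// ne[0] = columns (fastest-varying dim), while QNN MatMul expects [rows, cols].
#include <cstdint>
#include <cstdio>

int main() {
    // ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3): ne = {2, 3} -> 3 rows x 2 columns
    uint32_t ggml_dims[2] = {2, 3};
    uint32_t qnn_dims[2]  = {ggml_dims[1], ggml_dims[0]};   // swap the last two axes -> {3, 2}
    printf("qnn dims: %u x %u\n", qnn_dims[0], qnn_dims[1]);
    return 0;
}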
CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); - - //step-6: finalize qnn graph and execute qnn graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - input_tensors_0, 2, - output_tensors_0, 1, - NULL, NULL)); - - qnn_tensors_t ggml_op_mulmat_tensors; - ggml_op_mulmat_tensors.reserve(5); - ggml_op_mulmat_tensors.push_back(p_tensor0); - ggml_op_mulmat_tensors.push_back(p_tensor1); - ggml_op_mulmat_tensors.push_back(p_tensor2); - ggml_op_mulmat_tensors.push_back(p_param_tensor); - ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); - - auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; - - //avoid cleanup these resource to make test_backend_ops happy - //free_qnn_tensor(p_param_tensor); - //restore pointer to avoid memory leak - QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose; - //free_qnn_tensor(p_tensor2_transpose); - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - //attention: - // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through - // QNN SDK, details could be found at - // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - } - - // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; - op_perf.info(); + return false; } -static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) { +static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { ggmlqnn_op_func_t func = nullptr; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - switch (tensor->op) { + switch (dst->op) { + case GGML_OP_REPEAT: + ggml_qnn_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggml_qnn_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggml_qnn_dup(ctx, dst); + break; case GGML_OP_ADD: func = ggml_qnn_general_node; break; - - case GGML_OP_MUL_MAT: - func = ggml_qnn_mul_mat; + case GGML_OP_ACC: + ggml_qnn_acc(ctx, dst); break; - case GGML_OP_MUL: func = ggml_qnn_general_node; break; - + case GGML_OP_DIV: + ggml_qnn_div(ctx, dst); + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + break; + case GGML_UNARY_OP_SILU: + break; + case GGML_UNARY_OP_GELU_QUICK: + break; + case GGML_UNARY_OP_TANH: + break; + case GGML_UNARY_OP_RELU: + break; + case GGML_UNARY_OP_HARDSIGMOID: + break; + case GGML_UNARY_OP_HARDSWISH: + break; + default: + return false; + } + break; + case GGML_OP_NORM: + ggml_qnn_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggml_qnn_group_norm(ctx, 
dst); + break; + case GGML_OP_CONCAT: + ggml_qnn_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggml_qnn_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggml_qnn_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggml_qnn_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggml_qnn_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggml_qnn_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggml_qnn_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggml_qnn_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggml_qnn_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggml_qnn_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggml_qnn_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggml_qnn_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggml_qnn_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + case GGML_OP_DIAG_MASK_INF: + ggml_qnn_diag_mask(ctx, dst, -INFINITY); + break; + case GGML_OP_SOFT_MAX: + ggml_qnn_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggml_qnn_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggml_qnn_im2col(ctx, dst); + break; + case GGML_OP_POOL_2D: + ggml_qnn_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggml_qnn_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggml_qnn_argsort(ctx, dst); + break; default: return false; } if (nullptr != func) - func(backend, tensor); + func(ctx, dst); return true; } @@ -3598,14 +2447,12 @@ static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - return ctx->buffer; } static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - GGML_UNUSED(error); + GGML_UNUSED(tensor); GGML_UNUSED(ctx); return; } @@ -3649,14 +2496,6 @@ static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t memset(ctx->buffer, value, ctx->buffer_size); } -[[maybe_unused]]static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - for (auto * sub_buffer : ctx->sub_buffers) { - free(sub_buffer); - } - ctx->sub_buffers.clear(); -} - static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, @@ -3666,10 +2505,11 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, /* .clear = */ ggml_backend_qnn_buffer_clear, - /* .reset = */ NULL, + /* .reset = */ nullptr, }; static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); return "qnn-buffer"; } @@ -3677,7 +2517,13 @@ static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; +#if defined(__ANDROID__) || defined(__linux__) size_t size_page = sysconf(_SC_PAGESIZE); +#elif defined(_WIN32) + 
SYSTEM_INFO systeminfo; + GetSystemInfo(&systeminfo); + size_t size_page = systeminfo.dwPageSize; +#endif size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); @@ -3697,11 +2543,11 @@ static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_typ return 32; } -//FIXME: this value is an experimental value on Xiaomi14 +//TODO:not used currently static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return (2 * (1 << 30)); + return (2 * (1 << 20)); } static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { @@ -3777,12 +2623,11 @@ static const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { return "unknown"; } return ctx->name; - - GGML_UNUSED(dev); } static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { struct ggml_backend_qnn_context * ctx = static_cast(dev->context); + static char qnn_device_desc[256]; if (nullptr == ctx) { GGMLQNN_LOG_ERROR("pls check why ctx is null"); return "unknown"; @@ -3793,7 +2638,9 @@ static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t d std::string dev_desc = std::string(ctx->desc) + std::string(soc_info) + "_" + std::string(htp_arch) + "," + std::string(ctx->socinfo.soc_desc); - return dev_desc.c_str(); + memset(qnn_device_desc, 0, 256); + memcpy(qnn_device_desc, dev_desc.c_str(), strlen(dev_desc.c_str())); + return qnn_device_desc; } else { return ctx->desc; } @@ -3855,7 +2702,7 @@ static ggml_backend_t ggml_backend_qnn_device_init_backend(ggml_backend_dev_t de } -ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { +static ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { if (device_index >= GGML_QNN_MAX_DEVICES) { GGMLQNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", device_index, GGML_QNN_MAX_DEVICES - 1); @@ -3868,10 +2715,11 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ NULL,// defaults to ggml_nbytes + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes /* .is_host = */ ggml_backend_qnn_buffer_is_host }, - /* .context = */ NULL, + /* .device = */ nullptr, + /* .context = */ nullptr, }; return &ggml_backend_buffer_type_qnn; @@ -3890,10 +2738,9 @@ static ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_host_ptr(ggml_b GGML_UNUSED(max_tensor_size); } - static bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; - return (ggml_qnn_can_handle_op(op)); + return (ggml_qnn_can_handle_op(ctx,op)); } static bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { @@ -3909,14 +2756,14 @@ static struct ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .get_props = */ ggml_backend_qnn_device_get_props, /* .init_backend = */ ggml_backend_qnn_device_init_backend, /* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, + /* .get_host_buffer_type = */ nullptr, /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_host_ptr, /* 
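// Illustrative sketch, not part of the patch: the page-size alignment performed in
// ggml_backend_qnn_buffer_type_alloc_buffer above (sysconf(_SC_PAGESIZE) on
// Android/Linux, GetSystemInfo().dwPageSize on Windows), reduced to a hypothetical
// round-up helper.
#include <cstddef>

static size_t align_up_to_page(size_t size, size_t page_size) {
    size_t rem = size % page_size;
    return (rem == 0) ? size : size + (page_size - rem);
}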
.supports_op = */ ggml_backend_qnn_device_supports_op, /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, - /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, + /* .offload_op = */ nullptr, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_synchronize = */ nullptr, }; static ggml_backend_i ggml_backend_qnn_interface = { @@ -3964,9 +2811,8 @@ struct ggml_backend_qnn_reg_context { }; static const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { - return "ggml-qnn"; - GGML_UNUSED(reg); + return "ggml-qnn"; } static size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { @@ -3987,10 +2833,15 @@ static ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg static void * ggml_backend_qnn_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); - if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + if (nullptr == name) + return nullptr; + + const char * slot_name = "ggml_backend_set_n_threads"; + //avoid buffer attack rather than strcmp + if (0 == std::memcmp(name, slot_name, strlen(slot_name))) { return (void *)ggml_backend_qnn_set_n_threads; } - return NULL; + return nullptr; } static const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { @@ -4057,6 +2908,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return g_qnn_mgr[device].backend; } +#if defined(__ANDROID__) std::string path = qnn_lib_path; if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", @@ -4085,6 +2937,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { GGMLQNN_LOG_ERROR("%s backend setenv failure\n", ggml_backend_qnn_get_devname(device)); } } +#endif qnn_instance * instance = nullptr; instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 63614e6afe110..3d239510b8d63 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -74,7 +74,7 @@ function check_and_download_ndk() function build_arm64 { - cmake -H. -B./out/android -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + cmake -H. 
-B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} cd out/android make -j16 show_pwd @@ -106,15 +106,15 @@ function check_qnn_libs() function update_qnn_libs() { - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ } @@ -129,32 +129,37 @@ function build_ggml_qnn() } -function run_llamacli() +function prepare_run_on_phone() { + if [ $# != 1 ]; then + print "invalid param" + return + fi + program=$1 + check_qnn_libs if [ -f ./out/android/bin/libggml-qnn.so ]; then adb push ./out/android/bin/*.so ${REMOTE_PATH}/ fi - adb push ./out/android/bin/llama-cli ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/llama-cli + adb push ./out/android/bin/${program} ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/${program} +} + +function run_llamacli() +{ + prepare_run_on_phone llama-cli adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" } function run_llamabench() { - check_qnn_libs - - if [ -f ./out/android/bin/libggml-qnn.so ]; then - adb push ./out/android/bin/*.so ${REMOTE_PATH}/ - fi - adb push ./out/android/bin/llama-bench ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/llama-bench + prepare_run_on_phone llama-bench adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ @@ -165,13 +170,7 @@ function run_llamabench() function run_test-backend-ops() { - check_qnn_libs - - if [ -f ./out/android/bin/libggml-qnn.so ]; then - adb push ./out/android/bin/*.so ${REMOTE_PATH}/ - fi - adb push ./out/android/bin/test-backend-ops ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/test-backend-ops + prepare_run_on_phone test-backend-ops adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ @@ -179,6 +178,36 @@ function run_test-backend-ops() } +function run_ut_add() +{ + prepare_run_on_phone ggml-qnn-ut + + adb shell 
"cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_ADD -b $qnnbackend" + +} + +function run_ut_mulmat() +{ + prepare_run_on_phone ggml-qnn-ut + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_MUL_MAT -b $qnnbackend" + +} + +function run_ut_mul() +{ + prepare_run_on_phone ggml-qnn-ut + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_MUL -b $qnnbackend" + +} + function show_usage() { @@ -186,6 +215,9 @@ function show_usage() echo " $0 build" echo " $0 updateqnnlib" echo " $0 run_testop" + echo " $0 run_ut_add 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_ut_mulmat 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_ut_mul 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" echo -e "\n\n\n" @@ -213,6 +245,7 @@ elif [ $# == 1 ]; then elif [ "$1" == "run_testop" ]; then run_test-backend-ops exit 0 + elif [ "$1" == "updateqnnlib" ]; then update_qnn_libs exit 0 @@ -233,6 +266,15 @@ elif [ $# == 2 ]; then elif [ "$1" == "run_llamabench" ]; then run_llamabench exit 0 + elif [ "$1" == "run_ut_add" ]; then + run_ut_add + exit 0 + elif [ "$1" == "run_ut_mulmat" ]; then + run_ut_mulmat + exit 0 + elif [ "$1" == "run_ut_mul" ]; then + run_ut_mul + exit 0 fi else show_usage From 4dce1e03d7384e9b0fa1be3f54e97587d2cadb89 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 18 Feb 2025 09:53:57 +0800 Subject: [PATCH 074/200] ggml-qnn: merge QNN RPC feature from https://github.com/zhouwg/kantv/blob/ggml-qnn-quantize/core/ggml/llamacpp/ggml-qnn.cpp --- ggml/src/ggml-qnn/ggml-qnn.cpp | 2974 -------------------------------- 1 file changed, 2974 deletions(-) delete mode 100644 ggml/src/ggml-qnn/ggml-qnn.cpp diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp deleted file mode 100644 index c830128f750c8..0000000000000 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ /dev/null @@ -1,2974 +0,0 @@ -/* - * Copyright (c) 2023-2024 The ggml authors - * - * Qualcomm QNN SDK and reference tech guides could be found at: - * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk - * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools - * - * the implementation of ggml-qnn backend has six sections: - * section-1 does forward/external declaration, - * section-2 defines ggml-qnn internal log function - * section-3 does general helper macro / data structure / function - * section-4 does QNN helper macro / data structure / function - * section-5 does ggml-qnn backend helper macro / data structure / function / class - * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem - * - * currently provide following ggml ops' QNN backend implementation in ggml-qnn-ops.cpp: - * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise - * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise - * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including 
without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#include "ggml-qnn-impl.h" -#include "ggml-qnn-ops.h" -// ================================================================================================= -// section-1: forward/external declaration -// ================================================================================================= -static int free_qnn_tensor(Qnn_Tensor_t * tensor); -static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); -typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); - -// ================================================================================================= -// section-2: ggml-qnn internal troubleshooting function -// ================================================================================================= -void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ - static std::mutex ggmlqnn_log_internal_mutex; - static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; - - GGML_UNUSED(file); - { - std::lock_guard lock(ggmlqnn_log_internal_mutex); - va_list args; - va_start(args, format); - int len_prefix = snprintf(s_ggmlqnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggmlqnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { -#if (defined __ANDROID__) || (defined ANDROID) - //for Android application(standard APP or command line tool) - __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); - if (GGML_LOG_LEVEL_INFO == level) { - printf("%s\n", s_ggmlqnn_log_internal_buf); - } -#else - //for Snapdragon based WoA(Windows on ARM) device or Linux - printf("%s\n", s_ggmlqnn_log_internal_buf); -#endif - } - va_end(args); - } -} - -// ================================================================================================= -// section-3: general helper macro / data structure / function -// ================================================================================================= -#if defined(_WIN32) -static const char * last_func = nullptr; -static long last_err; -void * dlopen(const char * dll, int flags) { - HINSTANCE h = LoadLibraryA(dll); - GGML_UNUSED(flags); - if (h == NULL) { - last_err = GetLastError(); - last_func = "dlopen"; - } - return h; -} - -int dlclose(void * h) { - if (!FreeLibrary((HINSTANCE)h)) { - last_err = GetLastError(); - last_func = "dlclose"; - return -1; - } - return 0; -} - -void * dlsym(void * h, const char * name) { - FARPROC p = GetProcAddress((HINSTANCE)h, name); - if (!p) { - last_err = GetLastError(); - last_func = "dlsym"; - } - return (void*)(intptr_t)p; -} - -const char * dlerror(void) { - static char str[512]; - if (!last_err) return nullptr; - - snprintf(str, 512, "%s error #%ld", last_func, last_err); - last_err = 0; - last_func = NULL; - - return str; -} -#endif - -static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 ? offset - : offset + - (static_cast(alignment) - - offset % static_cast(alignment)); -} - -static size_t get_system_total_memory_in_bytes() { -#if defined(__ANDROID__) || defined(__linux__) - struct sysinfo info = {}; - if (0 == sysinfo(&info)) { - return (info.totalram + info.totalswap) * info.mem_unit; - } - auto pages = (size_t)sysconf(_SC_PHYS_PAGES); - auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); - - return pages * page_size; -#elif defined(_WIN32) - //TODO: Snapdragon based WoA(Windows on ARM) - return 0; -#else -#error "ggml-qnn only support WoA, Android, Linux" -#endif -} - -static size_t get_system_free_memory_in_bytes() { -#if defined(__ANDROID__) || defined(__linux__) - struct sysinfo info = {}; - if (0 == sysinfo(&info)) { - return (info.freeram + info.freeswap) * info.mem_unit; - } - auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); - auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); - - return avail_pages * page_size; -#elif defined(_WIN32) - //TODO: Snapdragon based WoA(Windows on ARM) - return 0; -#else -#error "ggml-qnn only support WoA, Android, Linux" -#endif -} - -static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { - if (!dst || !src || !dst_size || !copy_size) - return 0; - - size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; - - memcpy(dst, src, min_size); - - return min_size; -} - -static char * ggmlqnn_strndup(const char * source, size_t maxlen) { - return strndup(source, maxlen); -} - -static void * ggmlqnn_host_malloc(size_t n) { -#if defined(__ANDROID__) || defined(__linux__) - void * data = nullptr; - int result = posix_memalign((void **)&data, sysconf(_SC_PAGESIZE), n); - if (result != 0) { - GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); - return nullptr; - } -#elif defined(_WIN32) - //TODO: Snapdragon based WoA(Windows on ARM) - return nullptr; -#else -#error "ggml-qnn only support WoA, Android, Linux" -#endif - - return data; -} - -// ================================================================================================= -// section-4: QNN helper macro / data structure / function -// ================================================================================================= -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) -#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) -#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - -static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; - } - - return 0u; -} - -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; - } - return nullptr; -} - -static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; - } - return QNN_TENSOR_TYPE_UNDEFINED; -} - -static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; - } - return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; -} - -static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { - if (tensor.version == 
QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; - } - return QNN_DATATYPE_UNDEFINED; -} - -static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; - } - return QNN_QUANTIZE_PARAMS_INIT; -} - -static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; - } - return 0u; -} - -static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; - } - return nullptr; -} - -static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; - } - return QNN_TENSORMEMTYPE_UNDEFINED; -} - -static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; - } -} - -static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; - } -} - -static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.type = type; - } -} - -static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataFormat = format; - } -} - -static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataType = dataType; - } -} - -static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.quantizeParams = params; - } -} - -static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.rank = rank; - } -} - -static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dimensions = dims; - } -} - -static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = memType; - } -} - -static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = clientBuf; - } -} - -static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memHandle = handle; - } -} - -static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { - int err = 0; - - dst.version = src.version; - QNN_TENSOR_SET_NAME(dst, ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (nullptr == QNN_TENSOR_GET_NAME(dst)) { - return 1; - } - QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); - QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); - QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); - QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); - QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); - - if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { - Qnn_ClientBuffer_t client_buf = {nullptr, 
0}; - QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); - } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { - QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); - } else { - return 1; - } - - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; - if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; - size_t scale_offset_size = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); - ggmlqnn_memscpy(*scale_offset, - scale_offset_size, - src_qparam.axisScaleOffsetEncoding.scaleOffset, - scale_offset_size); - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); - float ** scales = &bwaxis_scale_offset.scales; - int32_t ** offsets = &bwaxis_scale_offset.offsets; - *scales = (float *)malloc(scale_size); - ggmlqnn_memscpy(*scales, scale_size, src_qparam.bwAxisScaleOffsetEncoding.scales, scale_size); - - if (bwaxis_scale_offset.offsets != nullptr) { - size_t offset_size = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *)malloc(offset_size); - ggmlqnn_memscpy(*offsets, offset_size, src_qparam.bwAxisScaleOffsetEncoding.offsets, offset_size); - } - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else { - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); - } - - uint32_t rank = QNN_TENSOR_GET_RANK(src); - QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); - uint32_t * dimensions = (uint32_t *)malloc(dim_size); - if (nullptr == dimensions) { - GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); - return 1; - } - ggmlqnn_memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); - QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); - - return err; -} - -static int free_qnn_tensor(Qnn_Tensor_t * tensor) { - int err = 0; - free((void *) QNN_TENSOR_GET_NAME(*tensor)); - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; - if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - free(src_qparam.axisScaleOffsetEncoding.scaleOffset); - } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - free(src_qparam.bwAxisScaleOffsetEncoding.scales); - if (src_qparam.bwAxisScaleOffsetEncoding.offsets != nullptr) { - free(src_qparam.bwAxisScaleOffsetEncoding.offsets); - } - } - free(QNN_TENSOR_GET_DIMENSIONS(*tensor)); - free(tensor); - - return err; -} - -const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { - // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html - switch (qnn_error_code) { - case QNN_SUCCESS: - return "QNN_SUCCESS"; - case QNN_COMMON_ERROR_GENERAL: - return "QNN_COMMON_ERROR_GENERAL"; - - // QnnGraph_Error_t - case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: - return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; - case QNN_GRAPH_ERROR_MEM_ALLOC: - return 
"QNN_GRAPH_ERROR_MEM_ALLOC"; - case QNN_GRAPH_ERROR_INVALID_ARGUMENT: - return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; - case QNN_GRAPH_ERROR_INVALID_HANDLE: - return "QNN_GRAPH_ERROR_INVALID_HANDLE"; - case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: - return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; - case QNN_GRAPH_ERROR_INVALID_NAME: - return "QNN_GRAPH_ERROR_INVALID_NAME"; - case QNN_GRAPH_ERROR_INVALID_TENSOR: - return "QNN_GRAPH_ERROR_INVALID_TENSOR"; - case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: - return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; - case QNN_GRAPH_ERROR_SET_PROFILE: - return "QNN_GRAPH_ERROR_SET_PROFILE"; - case QNN_GRAPH_ERROR_UNCONNECTED_NODE: - return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; - case QNN_GRAPH_ERROR_CREATE_FAILED: - return "QNN_GRAPH_ERROR_CREATE_FAILED"; - case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: - return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; - case QNN_GRAPH_ERROR_FINALIZE_FAILED: - return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; - case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: - return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; - case QNN_GRAPH_ERROR_GRAPH_FINALIZED: - return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; - case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: - return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; - case QNN_GRAPH_ERROR_SIGNAL_IN_USE: - return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; - case QNN_GRAPH_ERROR_ABORTED: - return "QNN_GRAPH_ERROR_ABORTED"; - case QNN_GRAPH_ERROR_PROFILE_IN_USE: - return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; - case QNN_GRAPH_ERROR_TIMED_OUT: - return "QNN_GRAPH_ERROR_TIMED_OUT"; - case QNN_GRAPH_ERROR_SUBGRAPH: - return "QNN_GRAPH_ERROR_SUBGRAPH"; - case QNN_GRAPH_ERROR_DISABLED: - return "QNN_GRAPH_ERROR_DISABLED"; - case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: - return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; - case QNN_GRAPH_ERROR_TENSOR_SPARSITY: - return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; - case QNN_GRAPH_ERROR_EARLY_TERMINATION: - return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; - case QNN_GRAPH_ERROR_INVALID_CONTEXT: - return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; - - //QQnnTensor_Error_t - //Invalid context/graph handle in creating tensor - case QNN_TENSOR_ERROR_INVALID_HANDLE: - return "QNN_TENSOR_ERROR_INVALID_HANDLE"; - //Tensor with specified credentials not registered with a context/graph - case QNN_TENSOR_ERROR_DOES_NOT_EXIST: - return "QNN_TENSOR_ERROR_DOES_NOT_EXIST"; - // (deprecated) Tensor has already been registered with backend - case QNN_TENSOR_ERROR_ALREADY_EXISTS: - return "QNN_TENSOR_ERROR_ALREADY_EXISTS"; - // Invalid tensor param. 
- case QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM: - return "QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM"; - // This tensor param is currently unsupported - case QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM: - return "QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM"; - // Tensor provided for update is invalid - case QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE: - return "QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE"; - - // QnnOpPackage_Error_t - case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: - return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; - case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: - return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; - case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: - return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; - case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: - return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; - case QNN_OP_PACKAGE_ERROR_INVALID_INFO: - return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; - case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: - return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; - case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: - return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; - - default: - return "unknown QNN error"; - } -} - -// helper function to create an operation config -Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, - Qnn_Param_t * params, uint32_t num_params, - Qnn_Tensor_t * inputs, uint32_t num_inputs, - Qnn_Tensor_t * outputs, uint32_t num_outputs) { - Qnn_OpConfigV1_t v1 = {name, package, type, - num_params, params, - num_inputs, inputs, - num_outputs, outputs - }; - Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; - - return opcfg; -} - -// ================================================================================================= -// section-5:ggml-qnn backend helper macro / data structure / function / class -// ================================================================================================= -//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices -static struct qcom_socinfo g_qnn_soc_info_table[] = { - /* Qualcomm SnapDragon 7 Gen 1 */ - [SM7450] = { - .soc_model = SM7450, - .htp_arch = V69, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 7 Gen 1"}, - - /* Qualcomm SnapDragon 888 */ - [SM8350] = { - .soc_model = SM8350, - .htp_arch = V68, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 888 "}, - - /* Qualcomm SnapDragon 8 Gen 1 */ - [SM8450] = { - .soc_model = SM8450, - .htp_arch = V69, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 1"}, - - /* Qualcomm SnapDragon 8 Gen 1+ */ - [SM8475] = { - .soc_model = SM8475, - .htp_arch = V69, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 1+"}, - - /* Qualcomm SnapDragon 8 Gen 2 */ - [SM8550] = { - .soc_model = SM8550, - .htp_arch = V73, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 2"}, - - /* Qualcomm SnapDragon 8 Gen 3 */ - [SM8650] = { - .soc_model = SM8650, - .htp_arch = V75, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 3 "}, - - /* Qualcomm SnapDragon 8 Gen 4 */ - [SM8750] = { - .soc_model = SM8750, - .htp_arch = V79, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, - -#if defined(_WIN32) - /* Qualcomm SnapDragon 7c Gen 2 */ - [SC7280X] = { - .soc_model = SC7280X, - .htp_arch = V68, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 7c Gen 2"}, - - /* Qualcomm SnapDragon 8cx Gen 3 */ - [SC8280X] = { - .soc_model = SC8280X, - 
.htp_arch = V68, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8cx Gen 3"}, - - /* Qualcomm SnapDragon 8cx Gen 4 */ - [SC8380XP] = { - .soc_model = SC8380XP, - .htp_arch = V73, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8cx Gen 4"}, -#endif - -}; - -//the following helper funcs are used to ensure every QNN tensor name is unique -static std::atomic g_ggmltensor_idx(0); -static void reset_idx() { - g_ggmltensor_idx = 0; -} - -static void inc_idx() { - g_ggmltensor_idx++; -} - -static int32_t get_idx() { - return g_ggmltensor_idx.load(); -} - -// file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html -// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend -// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend -// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend -// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend -// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend -static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = {.device = 0, - .threads = 1, - .name = "qnn-cpu", - .desc = "Qualcomm Kryo CPU", -#if defined(_WIN32) - .lib = "QnnCpu.dll", -#else - .lib = "libQnnCpu.so", -#endif - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, - - [QNN_BACKEND_GPU] = {.device = 1, - .threads = 1, - .name = "qnn-gpu", - .desc = "Qualcomm Adreno GPU", -#if defined(_WIN32) - .lib = "QnnGpu.dll", -#else - .lib = "libQnnGpu.so", -#endif - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, - - [QNN_BACKEND_NPU] = {.device = 2, - .threads = 1, - .name = "qnn-npu", - .desc = "Qualcomm NPU(Hexagon Tensor Processor)", -#if defined(_WIN32) - .lib = "QnnHtp.dll", -#else - .lib = "libQnnHtp.so", -#endif - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, -}; - -const qnn_op_caps_t ggmlqnn_k_op_caps[] = { - {}, // GGML_OP_NONE - {}, // GGML_OP_DUP - { - // GGML_OP_ADD - QNN_OP_ELEMENT_WISE_ADD, - 2, - }, - {}, // GGML_OP_ADD1 - {}, // GGML_OP_ACC - {}, // GGML_OP_SUB - { - // GGML_OP_MUL - QNN_OP_ELEMENT_WISE_MULTIPLY, - 2, - }, - {}, // GGML_OP_DIV - {}, // GGML_OP_SQR - {}, // GGML_OP_SQRT - {}, // GGML_OP_LOG - {}, // GGML_OP_SIN - {}, // GGML_OP_COS - {}, // GGML_OP_SUM - {}, // GGML_OP_SUM_ROWS - {}, // GGML_OP_MEAN - {}, // GGML_OP_ARGMAX - {}, // GGML_OP_COUNT_EQUAL - {}, // GGML_OP_REPEAT - {}, // GGML_OP_REPEAT_BACK - {}, // GGML_OP_CONCAT - {}, // GGML_OP_SILU_BACK - {}, // GGML_OP_NORM - {}, // GGML_OP_RMS_NORM - {}, // GGML_OP_RMS_NORM_BACK - {}, // GGML_OP_GROUP_NORM - { - // GGML_OP_MUL_MAT - QNN_OP_MAT_MUL, - 2, - }, - {}, // GGML_OP_MUL_MAT_ID - {}, // GGML_OP_OUT_PROD - {}, // GGML_OP_SCALE - {}, // GGML_OP_SET - {}, // GGML_OP_CPY - {}, // GGML_OP_CONT - {}, // GGML_OP_RESHAPE - {}, // GGML_OP_VIEW - {}, // GGML_OP_PERMUTE - {}, // GGML_OP_TRANSPOSE - {}, // GGML_OP_GET_ROWS - {}, // GGML_OP_GET_ROWS_BACK - {}, // GGML_OP_DIAG - {}, // GGML_OP_DIAG_MASK_INF - {}, // GGML_OP_DIAG_MASK_ZERO - {}, // GGML_OP_SOFT_MAX - {}, // GGML_OP_SOFT_MAX_BACK - {}, // GGML_OP_ROPE - {}, // GGML_OP_ROPE_BACK - {}, // GGML_OP_CLAMP - {}, // GGML_OP_CONV_TRANSPOSE_1D - {}, // GGML_OP_IM2COL - {}, // 
GGML_OP_IM2COL_BACK - {}, // GGML_OP_CONV_TRANSPOSE_2D - {}, // GGML_OP_POOL_1D - {}, // GGML_OP_POOL_2D - {}, // GGML_OP_POOL_2D_BACK - {}, // GGML_OP_UPSCALE - {}, // GGML_OP_PAD - {}, // GGML_OP_PAD_REFLECT_1D - {}, // GGML_OP_ARANGE - {}, // GGML_OP_TIMESTEP_EMBEDDING - {}, // GGML_OP_ARGSORT - {}, // GGML_OP_LEAKY_RELU - {}, // GGML_OP_FLASH_ATTN_EXT - {}, // GGML_OP_FLASH_ATTN_BACK - {}, // GGML_OP_SSM_CONV - {}, // GGML_OP_SSM_SCAN - {}, // GGML_OP_WIN_PART - {}, // GGML_OP_WIN_UNPART - {}, // GGML_OP_GET_REL_POS - {}, // GGML_OP_ADD_REL_POS - {}, // GGML_OP_RWKV_WKV6 - {}, // GGML_OP_GATED_LINEAR_ATTN - {}, // GGML_OP_UNARY - {}, // GGML_OP_MAP_UNARY - {}, // GGML_OP_MAP_BINARY - {}, // GGML_OP_MAP_CUSTOM1_F32 - {}, // GGML_OP_MAP_CUSTOM2_F32 - {}, // GGML_OP_MAP_CUSTOM3_F32 - {}, // GGML_OP_MAP_CUSTOM1 - {}, // GGML_OP_MAP_CUSTOM2 - {}, // GGML_OP_MAP_CUSTOM3 - {}, // GGML_OP_CROSS_ENTROPY_LOSS - {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - {}, // GGML_OP_OPT_STEP_ADAMW - {}, // GGML_UNARY_OP_ABS - {}, // GGML_UNARY_OP_SGN - {}, // GGML_UNARY_OP_NEG - {}, // GGML_UNARY_OP_STEP - {}, // GGML_UNARY_OP_TANH - {}, // GGML_UNARY_OP_ELU - {}, // GGML_UNARY_OP_RELU - {}, // GGML_UNARY_OP_SIGMOID - {}, // GGML_UNARY_OP_GELU - {}, // GGML_UNARY_OP_GELU_QUICK - {}, // GGML_UNARY_OP_SILU - {}, // GGML_UNARY_OP_HARDSWISH - {}, // GGML_UNARY_OP_HARDSIGMOID - {}, // GGML_UNARY_OP_EXP -}; - -static const char * qnn_get_socmodel_desc(uint32_t soc_model) { - switch (soc_model) { - case SM7450: - return "SM7450"; - case SM8350: - return "SM8350"; - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - case SM8750: - return "SM8750"; - default: - return "unknown"; - } -} - -static const char * qnn_get_htparch_desc(size_t htp_arch) { - switch (htp_arch) { - case V68: - return "QCOM_HTP_V68"; - case V69: - return "QCOM_HTP_V69"; - case V73: - return "QCOM_HTP_V73"; - case V75: - return "QCOM_HTP_V75"; - case V79: - return "QCOM_HTP_V79"; - default: - return "unknown"; - } -} - -static struct qcom_socinfo * qnn_get_socinfo_from_socmodel(uint32_t soc_model) { - size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); - for (size_t idx = 0; idx < items; idx++) { - if (soc_model == g_qnn_soc_info_table[idx].soc_model) { - return &g_qnn_soc_info_table[idx]; - } - } - return nullptr; -} - - -static const char * ggml_get_type_name(ggml_type type) { - const struct ggml_type_traits * traits = ggml_get_type_traits(type); - return traits->type_name; -} - -static const char * get_ggml_type_name(ggml_type type) { - const auto * traits = ggml_get_type_traits(type); - return traits->type_name; -} - -// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; -} - -static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { - switch (qnn_type) { - case QNN_DATATYPE_FLOAT_32: - return GGML_TYPE_F32; - case QNN_DATATYPE_FLOAT_16: - return GGML_TYPE_F16; - case QNN_DATATYPE_UINT_32: - case QNN_DATATYPE_INT_32: - return GGML_TYPE_I32; - case 
QNN_DATATYPE_INT_16: - return GGML_TYPE_I16; - case QNN_DATATYPE_INT_8: - return GGML_TYPE_I8; - case QNN_DATATYPE_SFIXED_POINT_8: - return GGML_TYPE_Q8_0; - case QNN_DATATYPE_SFIXED_POINT_4: - return GGML_TYPE_Q4_0; - default: - break; - } - return GGML_TYPE_COUNT; -} - -static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) { - if (rank > GGML_MAX_DIMS) { - GGMLQNN_LOG_WARN("invalid params"); - return; - } - if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { - GGMLQNN_LOG_WARN("invalid params"); - return; - } - for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) - qnn_dimensions[idx] = ggml_dimensions[idx]; - - if (rank >= 2) { - qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; - qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; - } -} - -Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, - Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, - uint32_t rank, uint32_t * dims, - void * data, uint32_t data_size, - bool b_transpose) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - char tensor_name[GGML_MAX_NAME] = {}; - - //ensure the tensor name is unique - if (nullptr != name) { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); - } else { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, get_idx()); - } - GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); - inc_idx(); - - uint32_t reverse_dims[GGML_MAX_DIMS] = {}; - uint32_t transpose_dims[GGML_MAX_DIMS] = {}; - uint32_t * tensor_dims = nullptr; - //case 1:use dims info from ggml tensor - if (nullptr != tensor) { - //there are different dimension order between ggml tensor and qnn tensor - for (size_t idx = 0; idx < rank; idx++) { - reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; - } - tensor_dims = reverse_dims; - } - //case 2: use user's specified tensor_dims - if (nullptr != dims) { - tensor_dims = dims; - } - //case 3: transpose for dst tensor - if (b_transpose) { - GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case - - get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); - tensor_dims = transpose_dims; -#if 0 - for (size_t idx = 0; idx < 4; idx++) { - GGMLQNN_LOG_DEBUG("origin dim[%d]=%d\n", idx, reverse_dims[idx]); - } - for (size_t idx = 0; idx < 4; idx++) { - GGMLQNN_LOG_DEBUG("trans dim[%d]=%d\n", idx, transpose_dims[idx]); - } -#endif - } - - Qnn_Tensor_t qnn_tensor = { - .version= QNN_TENSOR_VERSION_1, - {.v1= { - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, - .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = rank, - .dimensions = tensor_dims, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, .dataSize = 0} - } - } - }; - if (nullptr != name) { - QNN_VER_PTR(qnn_tensor)->name = name; - } - Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (nullptr == p_qnn_tensor) { - GGMLQNN_LOG_WARN("calloc failed"); - return nullptr; - } - error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); - if (error != QNN_SUCCESS) { - free(p_qnn_tensor); - GGMLQNN_LOG_WARN("init tensor failed"); - return nullptr; - } - QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; - - return 
p_qnn_tensor; -} - -Qnn_Tensor_t * ggmlqnn_create_compute_tensor(const ggml_tensor * tensor) { - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], - (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - } - - qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); - Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(tensor, nullptr, - qnn_tensor_type, qnn_data_type, - ggml_n_dims(tensor), dimensions, - nullptr, 0); - - return p_qnn_tensor; -} - -void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - const enum ggml_type src0_type = src0->type; - - GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - GGML_ASSERT(nb00 == ggml_type_size(src0_type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - const int64_t ne_plane = ne01 * ne00; - const size_t desired_size = ((GGML_TYPE_F32 == src0_type) ? 0 : ne03 * ne02 * ne_plane * sizeof(float)); - ctx->desired_size = desired_size; - if (ctx->work_size < desired_size) { - ctx->work_data.reset(new char[desired_size]); - ctx->work_size = desired_size; - } - ctx->n_threads = std::thread::hardware_concurrency(); - void * wdata = ctx->work_data.get(); - // convert src0 to float - if (src0_type != GGML_TYPE_F32) { - const auto * type_traits = ggml_get_type_traits(src0_type); - ggml_to_float_t const to_float = type_traits->to_float; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; - float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; - - const int min_cols_per_thread = 4096; - const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); - const int n_threads = std::max( - std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); - for (int i = 1; i < n_threads; i++) { - const int64_t start = i * ne01 / n_threads; - const int64_t end = (i + 1) * ne01 / n_threads; - if (start < end) { - ctx->tasks.push_back(std::async(std::launch::async, [=]() { - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); - } - })); - } - } - { - // reuse the current thread for the first task - const int64_t start = 0; - const int64_t end = ne01 / n_threads; - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); - } - } - } - } - - // wait for all tasks to finish - for (auto &task: ctx->tasks) { - task.get(); - } - ctx->tasks.clear(); - } - return wdata; -} - -static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { - char buffer[256] = {}; - const char * type_name = get_ggml_type_name(tensor->type); - int len = 0; - switch (ggml_n_dims(tensor)) { - case 1: - len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name); - break; - case 2: - len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); - break; - case 3: 
-            len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
-                           (long)tensor->ne[2], type_name);
-            break;
-        case 4:
-        default:
-            len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1],
-                           (long)tensor->ne[2], (long)tensor->ne[3], type_name);
-            break;
-    }
-    GGML_ASSERT(len > 0 && len < (int)sizeof(buffer));
-    output.append(buffer, len);
-}
-
-size_t ggmlqnn_get_opcaps_size() {
-    return std::size(ggmlqnn_k_op_caps);
-}
-
-size_t ggmlqnn_get_op_index(const ggml_tensor * tensor) {
-    if (tensor->op == GGML_OP_UNARY) {
-        return static_cast<size_t>(GGML_OP_COUNT) + static_cast<size_t>(ggml_get_unary_op(tensor));
-    }
-
-    return tensor->op;
-}
-
-static size_t ggmlqnn_get_op_input_param_count(const ggml_tensor * op) {
-    auto op_index = ggmlqnn_get_op_index(op);
-    GGML_ASSERT(op_index < std::size(ggmlqnn_k_op_caps));
-    return ggmlqnn_k_op_caps[op_index].input_param_count;
-}
-
-void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output) {
-    GGML_ASSERT(op->op != GGML_OP_NONE);
-    output += ggml_op_desc(op);
-    output += get_ggml_type_name(op->type);
-    size_t param_count = ggmlqnn_get_op_input_param_count(op);
-    for (size_t i = 0; i < param_count; ++i) {
-        auto * input = op->src[i];
-        if (!input) {
-            break;
-        }
-        output += '_';
-        append_tensor_dimensions(input, output);
-    }
-}
-
-template <typename Fn>
-Fn load_qnn_functionpointers(void * handle, const char * function_name) {
-    return reinterpret_cast<Fn>(dlsym(handle, function_name));
-}
-
-std::mutex qnn_instance::_init_mutex;
-std::unordered_map<qnn_instance::BackendIdType, void *> qnn_instance::_loaded_lib_handle;
-std::unordered_map<std::string, qnn_instance::BackendIdType> qnn_instance::_lib_path_to_backend_id;
-std::unordered_map<qnn_instance::BackendIdType, const QnnInterface_t *> qnn_instance::_loaded_backend;
-
-void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) {
-    if (!_rpcmem_initialized) {
-        GGMLQNN_LOG_WARN("rpc memory not initialized\n");
-        return nullptr;
-    }
-
-    auto allocate_bytes = static_cast<size_t>(bytes + alignment);
-    void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes);
-    if (nullptr == buf) {
-        GGMLQNN_LOG_WARN("failed to allocate rpc memory\n");
-        return nullptr;
-    }
-
-    auto aligned_buf = reinterpret_cast<void *>(ggmlqnn_align_to(alignment,
-                                                reinterpret_cast<intptr_t>(buf)));
-    bool status = _rpcmem_store_map.insert(std::pair<void *, void *>(aligned_buf, buf)).second;
-    if (!status) {
-        GGMLQNN_LOG_WARN("failed to allocate rpc memory\n");
-        _pfn_rpc_mem_free(buf);
-    }
-    return aligned_buf;
-}
-
-void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) {
-    if (_rpcmem_usage > (_rpcmem_capacity - 8)) { // reserve 8 MB in the rpc mempool
-        GGMLQNN_LOG_WARN("rpc mempool capacity: %d MB, usage: %d MB", _rpcmem_capacity, _rpcmem_usage);
-        return nullptr;
-    }
-
-    auto aligned_buf = alloc_rpcmem_internal(bytes, alignment);
-    if (nullptr == aligned_buf)
-        return nullptr;
-    _rpcmem_usage_map.insert(std::pair<void *, size_t>(aligned_buf, bytes));
-
-    size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20);
-    rpcmem_usage_in_bytes += bytes;
-    _rpcmem_usage = rpcmem_usage_in_bytes / (1 << 20);
-    return aligned_buf;
-}
-
-void qnn_instance::free_rpcmem(void * buf) {
-    size_t rpcbuffer_size = 0;
-    if (!_rpcmem_initialized) {
-        GGMLQNN_LOG_WARN("rpc memory not initialized\n");
-    } else if (0 == _rpcmem_store_map.count(buf)) {
-        GGMLQNN_LOG_WARN("no allocated tensor\n");
-    } else {
-        GGMLQNN_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]);
-        for (std::unordered_map<void *, size_t>::iterator it = _rpcmem_usage_map.begin();
-             it != _rpcmem_usage_map.end();
-             it++) {
-            void * rpcbuffer = it->first;
-            if (buf == rpcbuffer) {
-                rpcbuffer_size = it->second;
-                size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20);
-                rpcmem_usage_in_bytes -= rpcbuffer_size;
-                _rpcmem_usage = rpcmem_usage_in_bytes / (1 << 20);
-            }
-        }
-        if (rpcbuffer_size != 0) {
-            _rpcmem_usage_map.erase(buf);
-        } else {
-            GGMLQNN_LOG_WARN("this should not happen, please check");
-        }
-        _pfn_rpc_mem_free(_rpcmem_store_map[buf]);
-        _rpcmem_store_map.erase(buf);
-    }
-}
-
-void qnn_instance::free_rpcmem() {
-    if (_rpcmem_store_map.empty()) {
-        GGMLQNN_LOG_WARN("no rpcmem allocated\n");
-        return;
-    }
-
-    for (std::unordered_map<void *, void *>::iterator it = _rpcmem_store_map.begin();
-         it != _rpcmem_store_map.end();
-         it++) {
-        void * rpcbuffer = it->second;
-        GGMLQNN_LOG_DEBUG("free rpc buffer %p", rpcbuffer);
-        _pfn_rpc_mem_free(rpcbuffer);
-    }
-    _rpcmem_store_map.clear();
-    _rpcmem_usage_map.clear();
-    _rpcmem_usage = 0;
-}
-
-int32_t qnn_instance::rpcmem_to_fd(void * buf) {
-    int32_t mem_fd = -1;
-    if (!is_rpcmem_initialized()) {
-        GGMLQNN_LOG_WARN("rpc memory not initialized\n");
-    } else {
-        mem_fd = _pfn_rpc_mem_to_fd(buf);
-    }
-
-    return mem_fd;
-}
-
-int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) {
-    if (nullptr == p_data || (nullptr == p_tensor)) {
-        GGMLQNN_LOG_WARN("invalid param\n");
-        return 1;
-    }
-
-    if (!is_rpcmem_initialized()) {
-        GGMLQNN_LOG_WARN("rpc memory not initialized\n");
-        return 2;
-    }
-
-    if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
-        GGMLQNN_LOG_WARN("tensor %s has already been registered to shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
-        return 3;
-    }
-
-    int32_t mem_fd = rpcmem_to_fd(p_data);
-    if (-1 == mem_fd) {
-        GGMLQNN_LOG_WARN("failed to get file descriptor\n");
-        return 4;
-    }
-    GGMLQNN_LOG_DEBUG("mem_fd %d\n", mem_fd);
-    Qnn_MemDescriptor_t descriptor = {
-        {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr},
-        QNN_VER_PTR(*p_tensor)->dataType,
-        QNN_MEM_TYPE_ION,
-        {{mem_fd}}};
-    Qnn_MemHandle_t handle = nullptr;
-    int error = QNN_SUCCESS;
-    error = _qnn_interface.qnn_mem_register(
-        _qnn_context_handle,
-        &descriptor,
-        /*numDescriptors=*/1,
-        &handle);
-    if (error != QNN_SUCCESS) {
-        GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error));
-        return 5;
-    } else {
-        GGMLQNN_LOG_INFO("tensor %s successfully registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
-    }
-    QNN_VER_PTR(*p_tensor)->memHandle = handle;
-    _qnn_mem_set.insert((std::pair<void *, Qnn_MemHandle_t>(p_data, handle)));
-
-    return 0;
-}
-
-Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type) {
-    if (!p_data) {
-        GGMLQNN_LOG_WARN("invalid param");
-        return nullptr;
-    }
-
-    if (!is_rpcmem_initialized()) {
-        GGMLQNN_LOG_WARN("rpc memory not initialized");
-        return nullptr;
-    }
-
-    if (is_rpcmem_registered(p_data)) {
-        GGMLQNN_LOG_WARN("rpc memory already registered");
-        return _qnn_rpc_buffer_to_handles[p_data];
-    }
-
-    auto mem_fd = rpcmem_to_fd(p_data);
-    if (mem_fd == -1) {
-        GGMLQNN_LOG_WARN("failed to get file descriptor");
-        return nullptr;
-    }
-
-    GGMLQNN_LOG_DEBUG("mem_fd %d", mem_fd);
-    Qnn_MemDescriptor_t descriptor = {
-        {rank, dimensions, nullptr},
-        data_type, QNN_MEM_TYPE_ION,
-        {{mem_fd}}
-    };
-    Qnn_MemHandle_t handle = nullptr;
-    auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle);
-    if (error != QNN_SUCCESS) {
-        GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s",
QNN_GET_ERROR_CODE(error), strerror(error)); - return nullptr; - } - - _qnn_rpc_buffer_to_handles.insert({p_data, handle}); - GGMLQNN_LOG_DEBUG("successfully register shared memory handler: %p", handle); - return handle; -} - -void * qnn_instance::get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - if (it->second == mem_handle) { - return it->first; - } - } - GGMLQNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; -} - -void qnn_instance::unregister_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - if (_qnn_mem_set.empty()) { - GGMLQNN_LOG_WARN("no rpcmem registered\n"); - } - - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); - } else { - GGMLQNN_LOG_DEBUG("unregister shared memory ok"); - } - } - _qnn_mem_set.clear(); -} - -void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) { - Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); - } - - auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), - [mem_handle](const auto &kv) { return kv.second == mem_handle; }); - if (it == _qnn_mem_set.end()) { - GGMLQNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); - return; - } - - _qnn_mem_set.erase(it); -} - -bool qnn_instance::is_rpcmem_allocated(void * buf) { - return _rpcmem_store_map.count(buf) != 0U; -} - -int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - GGMLQNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - - void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - GGMLQNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); - return 1; - } - - auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( - lib_handle, - "QnnInterface_getProviders"); - if (nullptr == get_providers) { - GGMLQNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); - return 2; - } - - // get QnnInterface Providers - std::uint32_t num_providers = 0; - const QnnInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - GGMLQNN_LOG_DEBUG("num_providers=%d\n", num_providers); - if (num_providers != _required_num_providers) { - GGMLQNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); - return 4; - } - - if (nullptr == provider_list) { - GGMLQNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - 
qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; - } - } - - if (!found_valid_interface) { - GGMLQNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; - } else { - GGMLQNN_LOG_INFO("find a valid qnn interface\n"); - } - set_qnn_raw_interface(qnn_interface); - - BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - GGMLQNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if (_loaded_lib_handle.count(backend_id) > 0) { - GGMLQNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - GGMLQNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); - } - } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; - - auto saver_initialize = - load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( - _loaded_lib_handle[backend_id], "QnnSaver_initialize"); - if (nullptr != saver_initialize) { - error = saver_initialize(saver_config); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); - return 7; - } - } else { - GGMLQNN_LOG_WARN("saver_initialize is null\n"); - } - - return 0; -} - -int qnn_instance::unload_backend() { - int dlclose_error = 0; - for (auto & it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); - if (dlclose_error != 0) { - GGMLQNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); - } - } - - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - - return 0; -} - -int qnn_instance::load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - -#ifdef _WIN32 - std::string system_lib_path = _lib_path + "QnnSystem.dll"; -#else - std::string system_lib_path = _lib_path + "libQnnSystem.so"; -#endif - GGMLQNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); - //re-try with default path of QNN binary runtime lib - _lib_path = "/data/local/tmp/"; -#ifdef _WIN32 - system_lib_path = _lib_path + "QnnSystem.dll"; -#else - system_lib_path = _lib_path + "libQnnSystem.so"; -#endif - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); - return 1; - } - } - - auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( - _system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - GGMLQNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); - return 2; - } - - uint32_t num_providers = 0; - const QnnSystemInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); - return 3; - } - - if (num_providers != _required_num_providers) { - GGMLQNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); - return 4; - } - - if 
(nullptr == provider_list) { - GGMLQNN_LOG_WARN("can not get providers\n"); - return 5; - } - - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == - provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= - provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; - } - } - if (!found_valid_system_interface) { - GGMLQNN_LOG_WARN("unable to find a valid qnn system interface\n"); - return 6; - } else { - GGMLQNN_LOG_INFO("find a valid qnn system interface\n"); - } - set_qnn_raw_system_interface(qnn_system_interface); - - _qnn_interface.set_qnn_system_interface(provider_list[0]); - - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - GGMLQNN_LOG_WARN("can not create QNN system contenxt\n"); - } else { - GGMLQNN_LOG_INFO("initialize qnn system successfully\n"); - } - - return 0; -} - -int qnn_instance::unload_system() { - int result = 0; - - if (nullptr == _system_lib_handle) { - GGMLQNN_LOG_DEBUG("system lib handle is null\n"); - return 1; - } - - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN system context\n"); - } - _qnn_system_handle = nullptr; - } - - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - GGMLQNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); - return 2; - } - - _system_lib_handle = nullptr; - - return result; -} - -#if GGMLQNN_PRINT_QNN_INTERNAL_LOG -static void ggml_qnn_logcallback(const char * fmt, - QnnLog_Level_t level, - uint64_t timestamp, - va_list argp) { - - static std::mutex log_mutex; - static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; - - const char * log_level_desc = ""; - switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = " ERROR "; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = " INFO "; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = " DEBUG "; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; - } - - double ms = (double) timestamp / 1000000.0; - { - std::lock_guard lock(log_mutex); - memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - GGMLQNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); - } -} -#else -static void ggml_qnn_logcallback(const char * fmt, - QnnLog_Level_t level, - uint64_t timestamp, - va_list argp) { - GGML_UNUSED(fmt); - GGML_UNUSED(level); - GGML_UNUSED(timestamp); - GGML_UNUSED(argp); -} -#endif - -int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - GGMLQNN_LOG_DEBUG("enter qni_init\n"); - const std::lock_guard lock(_init_mutex); - if (0 != load_system()) { - GGMLQNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } else { - GGMLQNN_LOG_DEBUG("load QNN system lib successfully\n"); - } - - std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { - int is_load_ok = 
load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - GGMLQNN_LOG_WARN("failed to load QNN backend\n"); - return 2; - } - } - - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || - 0 == _loaded_lib_handle.count(backend_id)) { - GGMLQNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", - backend_lib_path.c_str(), - _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); - return 3; - } - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); -#if 1 - _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); -#else - _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); -#endif - if (nullptr == _qnn_log_handle) { - GGMLQNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone - return 4; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn log successfully\n"); - } - - std::vector temp_backend_config; - _qnn_interface.qnn_backend_create(_qnn_log_handle, - temp_backend_config.empty() ? nullptr : temp_backend_config.data(), - &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - GGMLQNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn backend successfully\n"); - } - - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnstatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnnstatus) { - GGMLQNN_LOG_WARN("device property is not supported\n"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnstatus) { - GGMLQNN_LOG_WARN("device property is not known to backend\n"); - } - } - - auto qnnstatus = _qnn_raw_interface.deviceCreate( - _qnn_log_handle, nullptr, &_qnn_device_handle); - if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { - GGMLQNN_LOG_WARN("failed to create QNN device\n"); - } else { - GGMLQNN_LOG_INFO("create device successfully\n"); - } - - if (ggml_qnn_profile_level::profile_off != _profile_level) { - GGMLQNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (ggml_qnn_profile_level::profile_basic == _profile_level) { - GGMLQNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { - GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { - GGMLQNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { - GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } - } - -#if defined(__ANDROID__) || defined(__linux__) - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); -#elif defined(_WIN32) - _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); -#else -#error "ggml-qnn only support WoA, Android, Linux" -#endif - if (nullptr == _rpc_lib_handle) { - GGMLQNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 8; - } else { - GGMLQNN_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free - || nullptr == _pfn_rpc_mem_to_fd) { - GGMLQNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 9; - } - - if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy - _pfn_rpc_mem_init(); - - std::vector temp_context_config; - _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? nullptr : temp_context_config.data(), - &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - GGMLQNN_LOG_WARN("why failed to initialize qnn context, error:%s\n", strerror(errno)); - return 10; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n"); - } - - if (_backend_name.find("Htp") != std::variant_npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - GGMLQNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (size_t i = 0; i < p_info->v1.numHwDevices; i++) { - GGMLQNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - GGMLQNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, - (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); - GGMLQNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ - chipinfo.socModel, qnn_get_socmodel_desc(chipinfo.socModel), \ - htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); - struct qcom_socinfo * socinfo = qnn_get_socinfo_from_socmodel(chipinfo.socModel); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; - if (nullptr != socinfo) { - memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); - GGMLQNN_LOG_INFO("soc info:%s", socinfo->soc_desc); - } else { - memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, "unknown", 7); - GGMLQNN_LOG_INFO("soc info:unknown"); - } - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - - probe_device_meminfo(); - - if (0 != init_htp_perfinfra()) { - GGMLQNN_LOG_WARN("initialize HTP performance failure"); - } - if (0 != set_rpc_polling()) { - GGMLQNN_LOG_WARN("set RPC polling failure"); - } - if (0 != set_high_performance_mode()) { - GGMLQNN_LOG_WARN("set HTP high performance mode failure"); - } - } - - GGMLQNN_LOG_DEBUG("leave qni_init\n"); - - return 0; -} - -int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; - - GGMLQNN_LOG_DEBUG("enter %s\n", __func__); - reset_idx(); - - free_rpcmem(); - unregister_rpcmem(); - - if (nullptr != _pfn_rpc_mem_deinit) - _pfn_rpc_mem_deinit(); - - if (dlclose(_rpc_lib_handle) != 0) { - GGMLQNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); - } else { - GGMLQNN_LOG_DEBUG("succeed to close rpcmem lib\n"); - } - - if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - - } - _qnn_context_handle = nullptr; - } - - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - - } - _qnn_profile_handle = nullptr; - } - - if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - - } - _qnn_device_handle = nullptr; - } - - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; - - } - - if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; - } - - unload_backend(); - - unload_system(); - GGMLQNN_LOG_DEBUG("leave %s\n", __func__); - - return ret_status; -} - -int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb, size_t hvx_threads) { - _graph_name = graph_name; - _device_id = device; - - 
GGMLQNN_LOG_DEBUG("[%s][%s]created", ggml_backend_qnn_get_devname(device), graph_name.c_str()); - - Qnn_ErrorHandle_t error = QNN_SUCCESS; - Qnn_GraphHandle_t graph_handle = nullptr; - if (device == QNN_BACKEND_NPU) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = hvx_threads; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - const QnnGraph_Config_t * graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr}; - error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs, &graph_handle); - } else { - error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &graph_handle); - } - - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", - ggml_backend_qnn_get_devname(device), graph_name.c_str(), - ggmlqnn_get_error_string(error)); - return error; - } - - GGMLQNN_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); - _qnn_graph_handle = graph_handle; - return QNN_SUCCESS; -} - -int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { - int result = 0; - - if (nullptr == graph_name) { - GGMLQNN_LOG_WARN("graph name is null\n"); - return 1; - } - - if (!_graph_name.empty()) { - GGMLQNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); - return 2; - } - - if (!do_node_validation) { - GGMLQNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); - } - - _graph_name = graph_name; - _debug_tensor = debug; - _do_node_validations = do_node_validation; - - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, - graph_name, - graph_configs, - &_qnn_graph_handle); - if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { - GGMLQNN_LOG_WARN("failed to create graph in qnn context\n"); - return 3; - } else { - GGMLQNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); - } - - return 0; -} - -int qnn_instance::finalize_qnn_graph() { - if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, - _qnn_profile_handle, nullptr) - != 
QNN_GRAPH_NO_ERROR) {
-            GGMLQNN_LOG_WARN("finalizing graph failure\n");
-            return 1;
-        }
-    } else {
-        GGMLQNN_LOG_DEBUG("qnn graph handle is null\n");
-    }
-
-    return 0;
-}
-
-int qnn_instance::init_htp_perfinfra() {
-    QnnDevice_Infrastructure_t device_infra = nullptr;
-    int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra);
-    if (error != QNN_SUCCESS) {
-        GGMLQNN_LOG_WARN("failed to get qnn device infra\n");
-        return 1;
-    }
-
-    QnnHtpDevice_Infrastructure_t * htp_infra = static_cast<QnnHtpDevice_Infrastructure_t *>(device_infra);
-    QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra;
-    uint32_t power_configid = 1;
-    uint32_t device_id = 0;
-    uint32_t core_id = 0;
-    htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid);
-    _qnn_htp_perfinfra = htp_perfinfra;
-    _qnn_power_configid = power_configid;
-
-    return 0;
-}
-
-int qnn_instance::set_rpc_polling() {
-    if (_qnn_rpc_pollingtime > 0) {
-        QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime;
-        memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime));
-        rpc_pollingtime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
-        rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime;
-        const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr};
-        if (_qnn_htp_perfinfra) {
-            _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs);
-        }
-    }
-    return 0;
-}
-
-int qnn_instance::set_high_performance_mode() {
-    if (nullptr == _qnn_htp_perfinfra) {
-        GGMLQNN_LOG_DEBUG("perf infra is null\n");
-        return 1;
-    }
-
-    QnnHtpPerfInfrastructure_PowerConfig_t power_config;
-    memset(&power_config, 0, sizeof(power_config));
-    power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3;
-    power_config.dcvsV3Config.dcvsEnable = 0;
-    power_config.dcvsV3Config.setDcvsEnable = 1;
-    power_config.dcvsV3Config.contextId = _qnn_power_configid;
-    power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE;
-    power_config.dcvsV3Config.setSleepLatency = 1; // true to consider the latency parameter, otherwise false
-    power_config.dcvsV3Config.setBusParams = 1; // true to consider the bus parameters, otherwise false
-    power_config.dcvsV3Config.setCoreParams = 1; // true to consider the core parameters, otherwise false
-    power_config.dcvsV3Config.sleepDisable = 0; // set to 1 to disable sleep/low-power modes
-    power_config.dcvsV3Config.setSleepDisable = 0; // true to consider the sleep disable/enable parameter, otherwise false
-    // set sleep latency parameter
-    uint32_t latencyValue = 40;
-    power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 microseconds
-    // set bus clock parameters (see QnnHtpPerfInfrastructure_VoltageCorner_t enum)
-    power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
-    power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
-    power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
-    // set core clock parameters (see QnnHtpPerfInfrastructure_VoltageCorner_t enum)
-    power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
-    power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
-    power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER;
-    // set power config with different performance parameters
-    const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr};
-
-
_qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); - - return 0; -} - -void qnn_instance::probe_device_meminfo() { - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); - if (nullptr == rpc_buffer) { - GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - - free_rpcmem(); - _rpcmem_usage = 0; - GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); -} - -uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { - if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { - GGMLQNN_LOG_WARN("invalid params\n"); - return nullptr; - } - - uint8_t * qnn_rpcbuffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(ggml_tensor), 4)); - if (nullptr == qnn_rpcbuffer) { - GGMLQNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - return nullptr; - } else { - GGMLQNN_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer); - } - if (b_copydata) - memcpy(qnn_rpcbuffer, ggml_tensor->data, ggml_nbytes(ggml_tensor)); - instance->register_rpcmem(qnn_rpcbuffer, qnn_tensor); - return qnn_rpcbuffer; -} - -void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - //skip sanity check of params - if (nullptr != func_name && nullptr != ctx) { - GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); - } - GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); - GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], - src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); - GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]); - GGMLQNN_LOG_DEBUG("\n"); -} - -static void dump_op_info(const struct ggml_tensor * tensor) { - //skip sanity check of params - const struct ggml_tensor * src0 = tensor->src[0]; - struct ggml_tensor * src1 = tensor->src[1]; - struct ggml_tensor * dst = const_cast(tensor); - GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); - ggmlqnn_print_tensors_info(nullptr, nullptr, src0, src1, dst); -} - -// ================================================================================================= -// section-6: implementation of ggml-qnn backend -// 
================================================================================================= -//TODO: refine this function as it is a performance hotspot/bottleneck function -static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor) { - if (tensor->op == GGML_OP_NONE) { - return true; - } - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE - || tensor->op == GGML_OP_TRANSPOSE - || tensor->op == GGML_OP_VIEW - || tensor->op == GGML_OP_PERMUTE - ) { - return false; - } - - //TODO: add other op here - bool supported_op = ((tensor->op == GGML_OP_ADD) - || (tensor->op == GGML_OP_MUL_MAT) - || (tensor->op == GGML_OP_MUL) - ); - if (!supported_op) { - return false; - } - - struct ggml_tensor * src0 = tensor->src[0]; - struct ggml_tensor * src1 = tensor->src[1]; - - const int64_t ne00 = tensor->src[0]->ne[0]; - const int64_t ne01 = tensor->src[0]->ne[1]; - - const int64_t ne10 = tensor->src[1]->ne[0]; - const int64_t ne11 = tensor->src[1]->ne[1]; - - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; - - const uint32_t src0_rank = ggml_n_dims(src0); - const uint32_t src1_rank = ggml_n_dims(src1); - GGML_UNUSED(ne01); - GGML_UNUSED(ne10); - GGML_UNUSED(ne11); - GGML_UNUSED(ne0); - GGML_UNUSED(ne1); - - if (tensor->op == GGML_OP_ADD) { - //dump_op_info(tensor); - if (!ggml_are_same_shape(src0, src1)) { - return false; - } - if (ne00 < 32) - return false; - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - } - - if (tensor->op == GGML_OP_MUL_MAT) { - //dump_op_info(tensor); - if (src0_rank != src1_rank) // make QNN SDK happy - return false; - if (src0_rank < 2) // QNN's limitation, make QNN SDK happy - return false; - if (4 == src0_rank) //TODO: 4D matrix mulmat in CT - return false; - if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy - return false; - - if (ctx->device == QNN_BACKEND_NPU) - if (2 == src0_rank) - return (src0->type == GGML_TYPE_F32 - || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 - || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K - ) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); - else - return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); - else - return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) - && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); - } - - if (tensor->op == GGML_OP_MUL) { - //dump_op_info(tensor); - if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix - return false; - return (src0->type == GGML_TYPE_F32) - && (src1->type == GGML_TYPE_F32) - && (tensor->type == src1->type); - } - - return false; -} - -static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { - ggmlqnn_op_func_t func = nullptr; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - - switch (dst->op) { - case GGML_OP_REPEAT: - ggml_qnn_repeat(ctx, dst); - break; - case GGML_OP_GET_ROWS: - ggml_qnn_get_rows(ctx, dst); - break; - case GGML_OP_DUP: - ggml_qnn_dup(ctx, dst); - break; - case GGML_OP_ADD: - func = ggml_qnn_general_node; - break; - case GGML_OP_ACC: - ggml_qnn_acc(ctx, dst); - break; - case GGML_OP_MUL: - func = ggml_qnn_general_node; - break; - case GGML_OP_DIV: - ggml_qnn_div(ctx, dst); - break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(dst)) { - case 
GGML_UNARY_OP_GELU: - break; - case GGML_UNARY_OP_SILU: - break; - case GGML_UNARY_OP_GELU_QUICK: - break; - case GGML_UNARY_OP_TANH: - break; - case GGML_UNARY_OP_RELU: - break; - case GGML_UNARY_OP_HARDSIGMOID: - break; - case GGML_UNARY_OP_HARDSWISH: - break; - default: - return false; - } - break; - case GGML_OP_NORM: - ggml_qnn_norm(ctx, dst); - break; - case GGML_OP_GROUP_NORM: - ggml_qnn_group_norm(ctx, dst); - break; - case GGML_OP_CONCAT: - ggml_qnn_concat(ctx, dst); - break; - case GGML_OP_UPSCALE: - ggml_qnn_upsample_nearest2d(ctx, dst); - break; - case GGML_OP_PAD: - ggml_qnn_pad(ctx, dst); - break; - case GGML_OP_ARANGE: - ggml_qnn_arange(ctx, dst); - break; - case GGML_OP_TIMESTEP_EMBEDDING: - ggml_qnn_timestep_embedding(ctx, dst); - break; - case GGML_OP_LEAKY_RELU: - ggml_qnn_leaky_relu(ctx, dst); - break; - case GGML_OP_RMS_NORM: - ggml_qnn_rms_norm(ctx, dst); - break; - case GGML_OP_MUL_MAT: - ggml_qnn_mul_mat(ctx, dst); - break; - case GGML_OP_MUL_MAT_ID: - return false; - case GGML_OP_SCALE: - ggml_qnn_scale(ctx, dst); - break; - case GGML_OP_SQR: - ggml_qnn_sqr(ctx, dst); - break; - case GGML_OP_CLAMP: - ggml_qnn_clamp(ctx, dst); - break; - case GGML_OP_CPY: - ggml_qnn_cpy(ctx, dst); - break; - case GGML_OP_CONT: - ggml_qnn_dup(ctx, dst); - break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - case GGML_OP_DIAG_MASK_INF: - ggml_qnn_diag_mask(ctx, dst, -INFINITY); - break; - case GGML_OP_SOFT_MAX: - ggml_qnn_softmax(ctx, dst); - break; - case GGML_OP_ROPE: - ggml_qnn_rope(ctx, dst); - break; - case GGML_OP_IM2COL: - ggml_qnn_im2col(ctx, dst); - break; - case GGML_OP_POOL_2D: - ggml_qnn_pool2d(ctx, dst); - break; - case GGML_OP_SUM_ROWS: - ggml_qnn_sum_rows(ctx, dst); - break; - case GGML_OP_ARGSORT: - ggml_qnn_argsort(ctx, dst); - break; - default: - return false; - } - - if (nullptr != func) - func(ctx, dst); - - return true; -} - -struct ggml_backend_qnn_buffer_context { - ~ggml_backend_qnn_buffer_context() { - if (buffer) { - free(buffer); - } - - for (auto * sub_buffer : sub_buffers) { - free(sub_buffer); - } - - for (auto * qnn_tensor : qnn_tensors) { - free_qnn_tensor(qnn_tensor); - } - - sub_buffers.clear(); - qnn_tensors.clear(); - } - void * buffer = nullptr; - - struct ggml_backend_qnn_context * backend_ctx = nullptr; - - size_t buffer_size = 0; - std::vector sub_buffers; - std::vector qnn_tensors; -}; - -static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - delete ctx; -} - -static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - return ctx->buffer; -} - -static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - GGML_UNUSED(tensor); - GGML_UNUSED(ctx); - return; -} - -static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, - ggml_tensor * tensor, const void * data, - size_t offset, size_t size) { - GGML_UNUSED(buffer); - - memcpy((char *)tensor->data + offset, data, size); -} - -static void ggml_backend_qnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, - struct ggml_tensor * tensor, - uint8_t value, size_t offset, size_t size) { - GGML_UNUSED(buffer); - memset((char *)tensor->data 
+ offset, value, size); -} - -static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor * tensor, - void * data, size_t offset, size_t size) { - GGML_UNUSED(buffer); - memcpy(data, (const char *)tensor->data + offset, size); -} - -static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, - const struct ggml_tensor * src, - struct ggml_tensor * dst) { - GGML_UNUSED(buffer); - if (ggml_backend_buffer_is_host(src->buffer)) { - memcpy(dst->data, src->data, ggml_nbytes(src)); - return true; - } - - return false; -} - -static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - memset(ctx->buffer, value, ctx->buffer_size); -} - -static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { - /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, - /* .get_base = */ ggml_backend_qnn_buffer_get_base, - /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_qnn_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, - /* .clear = */ ggml_backend_qnn_buffer_clear, - /* .reset = */ nullptr, -}; - -static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return "qnn-buffer"; -} - -static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; - -#if defined(__ANDROID__) || defined(__linux__) - size_t size_page = sysconf(_SC_PAGESIZE); -#elif defined(_WIN32) - SYSTEM_INFO systeminfo; - GetSystemInfo(&systeminfo); - size_t size_page = systeminfo.dwPageSize; -#endif - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } - ctx->buffer = ggmlqnn_host_malloc(size_aligned); - ctx->buffer_size = size_aligned; - if (nullptr == ctx->buffer) { - GGMLQNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); - return nullptr; - } - - return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); -} - -static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return 32; -} - -//TODO:not used currently -static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - - return (2 * (1 << 20)); -} - -static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return true; -} - -static const char * ggml_backend_qnn_name(ggml_backend_t backend) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - return g_qnn_mgr[ctx->device].name; -} - -static void ggml_backend_qnn_free(ggml_backend_t backend) { - GGMLQNN_LOG_DEBUG("enter %s", __func__ ); - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - GGMLQNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - - qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; - if (instance != nullptr) { - std::map>>::iterator graph_it; - - for (graph_it = instance->_qnn_graph_map.begin(); - graph_it != instance->_qnn_graph_map.end(); graph_it++) { - auto & 
graph_item = graph_it->second; - Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensors = std::get<1>(graph_item); - for (auto tensor_it = tensors.begin(); tensor_it != tensors.end(); ++tensor_it) { - free_qnn_tensor(*tensor_it); - } - GGML_UNUSED(graph_handle); - GGMLQNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); - } - instance->_qnn_graph_map.clear(); - - instance->qnn_finalize(); - delete instance; - g_qnn_mgr[ctx->device].instance = nullptr; - } - - if (g_qnn_mgr[ctx->device].backend != nullptr) { - delete backend; - g_qnn_mgr[ctx->device].backend = nullptr; - } - GGMLQNN_LOG_DEBUG("leave %s", __func__ ); -} - -static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - GGML_UNUSED(ctx); - - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE - || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW - || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; - } - bool ok = ggml_qnn_compute_forward(backend, node); - if (!ok) { - GGMLQNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", - __func__, node->name, ggml_op_name(node->op)); - } - } - - return result; -} - -static const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { - struct ggml_backend_qnn_context *ctx = static_cast(dev->context); - if (nullptr == ctx) { - GGMLQNN_LOG_ERROR("pls check why ctx is null"); - return "unknown"; - } - return ctx->name; -} - -static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { - struct ggml_backend_qnn_context * ctx = static_cast(dev->context); - static char qnn_device_desc[256]; - if (nullptr == ctx) { - GGMLQNN_LOG_ERROR("pls check why ctx is null"); - return "unknown"; - } - if (0 == strncmp(ctx->name, "qnn-npu", 7)) { - const char * soc_info = qnn_get_socmodel_desc(ctx->socinfo.soc_model); - const char * htp_arch = qnn_get_htparch_desc(ctx->socinfo.htp_arch); - std::string dev_desc = std::string(ctx->desc) - + std::string(soc_info) + "_" + std::string(htp_arch) - + "," + std::string(ctx->socinfo.soc_desc); - memset(qnn_device_desc, 0, 256); - memcpy(qnn_device_desc, dev_desc.c_str(), strlen(dev_desc.c_str())); - return qnn_device_desc; - } else { - return ctx->desc; - } -} - -static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - struct ggml_backend_qnn_context * ctx = static_cast(dev->context); - if ((nullptr == ctx) || (ctx->device > QNN_BACKEND_GGML)) { - GGMLQNN_LOG_ERROR("pls check params"); - *free = 0; - *total = 0; - } - - if (QNN_BACKEND_CPU == ctx->device || QNN_BACKEND_GGML == ctx->device) { - *total = get_system_total_memory_in_bytes(); - *free = get_system_free_memory_in_bytes(); - } else if (QNN_BACKEND_GPU == ctx->device) { - //TODO: probe GPU info in Qualcomm Adreno GPU - *total = get_system_total_memory_in_bytes(); - *free = get_system_free_memory_in_bytes(); - } else if (QNN_BACKEND_NPU == ctx->device) { - size_t rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); - size_t rpc_ion_usage = ctx->instance->get_rpcmem_usage(); - GGMLQNN_LOG_DEBUG("rpc memsize %d", rpc_ion_memsize); - GGMLQNN_LOG_DEBUG("rpc usage %d", rpc_ion_usage); - *total = rpc_ion_memsize * (1 << 20); - *free = (rpc_ion_memsize - rpc_ion_usage) * (1 << 20); - } -} - -static 
enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { - GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_ACCEL; -} - -static void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, - struct ggml_backend_dev_props * props) { - props->name = ggml_backend_qnn_device_get_name(dev); - props->description = ggml_backend_qnn_device_get_description(dev); - props->type = ggml_backend_qnn_device_get_type(dev); - ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ true, - /* .events = */ false, - }; -} - -static ggml_backend_t ggml_backend_qnn_device_init_backend(ggml_backend_dev_t dev, const char * params) { - GGML_UNUSED(dev); - if (nullptr == params) { - params = 0; - } - ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) params, - "/data/local/tmp/"); - - return qnn_backend; - -} - -static ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { - if (device_index >= GGML_QNN_MAX_DEVICES) { - GGMLQNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", - device_index, GGML_QNN_MAX_DEVICES - 1); - return nullptr; - } - - static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes - /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, - /* .device = */ nullptr, - /* .context = */ nullptr, - }; - - return &ggml_backend_buffer_type_qnn; -} - -static ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; - return ggml_backend_qnn_buffer_type(ctx->device); -} - -static ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_host_ptr(ggml_backend_dev_t dev, - void * ptr, size_t size, size_t max_tensor_size) { - return ggml_backend_cpu_buffer_from_ptr(ptr, size); - - GGML_UNUSED(dev); - GGML_UNUSED(max_tensor_size); -} - -static bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; - return (ggml_qnn_can_handle_op(ctx,op)); -} - -static bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - GGML_UNUSED(dev); - return ggml_backend_buft_is_host(buft); -} - -static struct ggml_backend_device_i ggml_backend_qnn_device_interface = { - /* .get_name = */ ggml_backend_qnn_device_get_name, - /* .get_description = */ ggml_backend_qnn_device_get_description, - /* .get_memory = */ ggml_backend_qnn_device_get_memory, - /* .get_type = */ ggml_backend_qnn_device_get_type, - /* .get_props = */ ggml_backend_qnn_device_get_props, - /* .init_backend = */ ggml_backend_qnn_device_init_backend, - /* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type, - /* .get_host_buffer_type = */ nullptr, - /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_host_ptr, - /* .supports_op = */ ggml_backend_qnn_device_supports_op, - /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, - /* .offload_op = */ nullptr, - /* .event_new = 
*/ nullptr, - /* .event_free = */ nullptr, - /* .event_synchronize = */ nullptr, -}; - -static ggml_backend_i ggml_backend_qnn_interface = { - /* .get_name = */ ggml_backend_qnn_name, - /* .free = */ ggml_backend_qnn_free, - /* .set_tensor_async = */ nullptr, - /* .get_tensor_async = */ nullptr, - /* .cpy_tensor_async = */ nullptr, - /* .synchronize = */ nullptr, - /* .graph_plan_create = */ nullptr, - /* .graph_plan_free = */ nullptr, - /* .graph_plan_update = */ nullptr, - /* .graph_plan_compute = */ nullptr, - /* .graph_compute = */ ggml_backend_qnn_graph_compute, - /* .event_record = */ nullptr, - /* .event_wait = */ nullptr, -}; - -//FIXME: this guid is not make sense -static ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = { - 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 - }; - return &guid; -} - -bool ggml_backend_is_qnn(ggml_backend_t backend) { - return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); -} - -void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(ggml_backend_is_qnn(backend)); - - struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; - ctx->threads = n_threads; -} - -int ggml_backend_qnn_get_device_count() { - return GGML_QNN_MAX_DEVICES; -} - -struct ggml_backend_qnn_reg_context { - std::vector devices; -}; - -static const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return "ggml-qnn"; -} - -static size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return GGML_QNN_MAX_DEVICES; -} - -static ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { - GGML_UNUSED(reg); - GGML_UNUSED(index); - - GGMLQNN_LOG_DEBUG("index %d", index); - ggml_backend_qnn_reg_context * ctx = (ggml_backend_qnn_reg_context *)reg->context; - GGML_ASSERT(index < ctx->devices.size()); - return ctx->devices[index]; -} - -static void * ggml_backend_qnn_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { - GGML_UNUSED(reg); - - if (nullptr == name) - return nullptr; - - const char * slot_name = "ggml_backend_set_n_threads"; - //avoid buffer attack rather than strcmp - if (0 == std::memcmp(name, slot_name, strlen(slot_name))) { - return (void *)ggml_backend_qnn_set_n_threads; - } - return nullptr; -} - -static const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { - /* .get_name = */ ggml_backend_qnn_reg_get_name, - /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count, - /* .get_device = */ ggml_backend_qnn_reg_get_device, - /* .get_proc_address = */ ggml_backend_qnn_reg_get_proc_address, -}; - -ggml_backend_reg_t ggml_backend_qnn_reg() { - static ggml_backend_reg reg; - static bool initialized = false; - GGMLQNN_LOG_DEBUG("enter ggml_backend_qnn_reg"); - { - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { - ggml_backend_qnn_reg_context * ctx = new ggml_backend_qnn_reg_context; - - for (int i = 0; i < ggml_backend_qnn_get_device_count(); i++) { - ggml_backend_dev_t dev = new ggml_backend_device { - /* .iface = */ ggml_backend_qnn_device_interface, - /* .reg = */ ®, - /* .context = */ &g_qnn_mgr[i] - }; - ctx->devices.push_back(dev); - } - - reg = ggml_backend_reg { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_qnn_reg_interface, - /* .context = */ ctx - }; - } - - initialized = true; - } - GGMLQNN_LOG_DEBUG("leave 
ggml_backend_qnn_reg"); - - return ® -} - -/** - * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU - * @param qnn_lib_path QNN binrary runtime library path, such as "/data/local/tmp/" on Android or specified in JNI layer - * @return - */ -ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { - int result = 0; - - if (nullptr == qnn_lib_path) - return nullptr; - - GGMLQNN_LOG_DEBUG("device %d", device); - GGMLQNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); - if (device >= GGML_QNN_MAX_DEVICES) { - GGMLQNN_LOG_ERROR("invalid device %d", device); - return nullptr; - } - - if (nullptr != g_qnn_mgr[device].backend) { - GGMLQNN_LOG_WARN("qnn backend %d(%s) already loaded", device, ggml_backend_qnn_get_devname(device)); - return g_qnn_mgr[device].backend; - } - -#if defined(__ANDROID__) - std::string path = qnn_lib_path; - if (QNN_BACKEND_NPU == device) { - if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), - 1)) { - GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); - } else { - GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); - } - if (0 == setenv("ADSP_LIBRARY_PATH", - (path + - ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), - 1)) { - GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); - } else { - GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); - } - } else { - if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), - 1)) { - GGMLQNN_LOG_INFO("%s backend setenv successfully\n", ggml_backend_qnn_get_devname(device)); - } else { - GGMLQNN_LOG_ERROR("%s backend setenv failure\n", ggml_backend_qnn_get_devname(device)); - } - } -#endif - - qnn_instance * instance = nullptr; - instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); - result = instance->qnn_init(nullptr); - if (0 != result) { - GGMLQNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", ggml_backend_qnn_get_devname(device)); - delete instance; - return nullptr; - } - qnn_interface qnn_interface = instance->get_qnn_interface(); - if (!qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("qnn subsystem failure\n"); - delete instance; - return nullptr; - } - - std::string device_name = ggml_backend_qnn_get_devname(device); - GGMLQNN_LOG_INFO("qnn device name %s", device_name.c_str()); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); - g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); - - ggml_backend_t qnn_backend = new ggml_backend{ - /* .guid = */ ggml_backend_qnn_guid(), - /* .iface = */ ggml_backend_qnn_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_qnn_reg(), device), - /* .context = */ &g_qnn_mgr[device] - }; - g_qnn_mgr[device].backend = qnn_backend; - - return qnn_backend; -} - -GGML_BACKEND_DL_IMPL(ggml_backend_qnn_reg) From de55df2cde84a9575e18085c09ab19d99552673e Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 18 Feb 2025 17:35:04 +0800 Subject: [PATCH 075/200] ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc --- ggml/src/ggml-qnn/ggml-qnn.cpp | 3994 ++++++++++++++++++++++++++++++++ 1 file changed, 3994 insertions(+) create mode 100644 ggml/src/ggml-qnn/ggml-qnn.cpp diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp new file mode 100644 index 0000000000000..6f2949333908e 
--- /dev/null +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -0,0 +1,3994 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Qualcomm QNN SDK and reference tech guides could be found at: + * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk + * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools + * + * the implementation of ggml-qnn backend has six sections: + * section-1 does forward/external declaration, + * section-2 defines ggml-qnn internal log function + * section-3 does general helper macro / data structure / function + * section-4 does QNN helper macro / data structure / function + * section-5 does ggml-qnn backend helper macro / data structure / function / class + * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem + * + * currently only provide GGML_OP_ADD's QNN backend implementation: + * - GGML_OP_ADD: this is skeleton, can expand other ggml ops according to expertise + * + * of course, can porting ggml-qnn to Windows on ARM as need. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if (defined __ANDROID__) || (defined ANDROID) +#include "android/log.h" +#endif + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" +#include "HTP/QnnHtpGraph.h" + +#include "ggml-qnn.h" +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + +// ================================================================================================= +// section-1: forward/external declaration +// ================================================================================================= +class qnn_instance; +struct ggml_backend_qnn_context; +static int free_qnn_tensor(Qnn_Tensor_t * tensor); +static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); +static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); + +// ================================================================================================= +// section-2: ggml-qnn internal troubleshooting function +// ================================================================================================= +#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend +#define GGML_QNN_LOGBUF_LEN 4096 +#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_OP_ADD_LOG 1 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 + +#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGMLQNN_DEBUG +#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLQNN_LOG_DEBUG(...) +#endif +static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ + static std::mutex ggmlqnn_log_internal_mutex; + static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(ggmlqnn_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggmlqnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggmlqnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + //for Android application(standard APP or command line tool) + __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); +#endif +#if (defined __ANDROID__) || (defined ANDROID) + //do nothing when running on Snapdragon based Android device +#else + //for Snapdragon based WoA(Windows on ARM) device + printf("%s\n", s_ggmlqnn_log_internal_buf); +#endif + } + va_end(args); + } +} + +// ================================================================================================= +// section-3: general helper macro / data structure / function +// ================================================================================================= +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete + +#define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) +#define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) + +static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + +static void * ggmlqnn_mallocz_aligned(size_t size, size_t alignment) { + uint8_t * buffer = NULL; + size_t * sp = NULL; + buffer = static_cast(calloc(1, size + GGMLQNN_MEM_ADD(alignment))); + if (!buffer) + return NULL; + sp = (size_t *)buffer; + *sp = size; + buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); + buffer[-1] = buffer - (uint8_t *)sp; + return buffer; +} + +static void * ggmlqnn_malloc_aligned(size_t size, size_t alignment) { + uint8_t * buffer = NULL; + size_t * sp = NULL; + buffer = static_cast(malloc(size + GGMLQNN_MEM_ADD(alignment))); + if (!buffer) + return NULL; + sp = (size_t *)buffer; + *sp = size; + buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); + buffer[-1] = buffer - (uint8_t *)sp; + return buffer; +} + +static void ggmqnn_free_aligned(void * ptr) { + uint8_t * old = (uint8_t *)ptr; + if (!old) + return; + old -= old[-1]; + free(old); +} + +static size_t get_system_total_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + + auto pages = (size_t)sysconf(_SC_PHYS_PAGES); + auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return pages * page_size; +} + +static size_t get_system_free_memory_in_bytes() { + struct sysinfo info = {}; + if (sysinfo(&info) == 0) { + return (info.freeram + info.freeswap) * info.mem_unit; + } + + auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); + auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return avail_pages * page_size; +} + +static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) + return 
0; + + size_t min_size = dst_size < copy_size ? dst_size : copy_size; + + memcpy(dst, src, min_size); + + return min_size; +} + +static char * ggmlqnn_strndup(const char * source, size_t maxlen) { + return ::strndup(source, maxlen); +} + +static void * ggmlqnn_host_malloc(size_t n) { + void * data = NULL; + int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); + if (result != 0) { + GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); + return NULL; + } + + return data; +} + +// ================================================================================================= +// section-4: QNN helper macro / data structure / function +// ================================================================================================= +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + GGMLQNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ + } while (0) + +#define CHECK_QNN_API(error) \ + do { \ + if (QNN_SUCCESS != (error)) { \ + GGMLQNN_LOG_INFO("error = %d\n", (error)); \ + } \ + } while (0) + +#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) + +#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) + +#define QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) +#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) +#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) +#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) +#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) +#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) +#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) +#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) +#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) + +#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) +#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) +#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) + +#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ + set_qnn_op_config_params(op_config, num_of_params, params) + +#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ + set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) + +#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ + set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) + +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_CLIENT_BUF(tensor) 
get_qnn_tensor_clientbuf(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) + +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) + +static inline int validate_tensor_version(Qnn_Tensor_t tensor) { + if (tensor.version != QNN_TENSOR_VERSION_1) { + GGMLQNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", + tensor.v1.name, + tensor.version); + return 1; + } + return 0; +} + +[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { + if (op_config.version != QNN_OPCONFIG_VERSION_1) { + GGMLQNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", + op_config.v1.name, + op_config.version); + return 1; + } + return 0; +} + +static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.name; + } + return nullptr; +} + +[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { + return get_qnn_oponfig_name(*op_config); +} + +static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.packageName; + } + return nullptr; +} + +[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_packagename(*op_config); +} + +static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.typeName; + } + return nullptr; +} + +[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_typename(*op_config); +} + +static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfParams; + } + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numparams(*op_config); +} + +static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.params; + } + return nullptr; +} + +[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_params(*op_config); +} + +static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { + if 
(op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfInputs; + } + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numinputs(*op_config); +} + +static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.inputTensors; + } + return nullptr; +} + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_inputs(*op_config); +} + +static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.numOfOutputs; + } + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_numoutputs(*op_config); +} + +static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + return op_config.v1.outputTensors; + } + return nullptr; +} + +[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { + return get_qnn_op_config_outputs(*op_config); +} + +static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.name = name; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { + set_qnn_op_config_name(*op_config, name); +} + +static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.packageName = package_name; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { + set_qnn_op_config_packagename(*op_config, package_name); +} + +static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.typeName = type_name; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { + set_qnn_op_config_typename(*op_config, type_name); +} + +static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfParams = num_of_params; + op_config.v1.params = params; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, + uint32_t num_of_params, + Qnn_Param_t * params) { + set_qnn_op_config_params(*op_config, num_of_params, params); +} + +static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfInputs = num_of_inputs; + op_config.v1.inputTensors = input_tensors; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_inputs, + Qnn_Tensor_t * input_tensors) { + set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); +} + +static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & 
op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + if (op_config.version == QNN_OPCONFIG_VERSION_1) { + op_config.v1.numOfOutputs = num_of_outputs; + op_config.v1.outputTensors = output_tensors; + } +} + +[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, + uint32_t num_of_outputs, + Qnn_Tensor_t * output_tensors) { + set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); +} + +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } + + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorid(*tensor); +} + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} + +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { + return get_qnn_tensorname(*tensor); +} + +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} + +[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensortype(*tensor); +} + +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} + +[[maybe_unused]] static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dataformat(*tensor); +} + +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} + +[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_datatype(*tensor); +} + +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} + +[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_quantparams(*tensor); +} + +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} + +[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_rank(*tensor); +} + +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} + +[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_dimensions(*tensor); +} + +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} + +[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memtype(*tensor); +} + +static inline 
Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.clientBuf; + } + return QNN_CLIENT_BUFFER_INIT; +} + +[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_clientbuf(*tensor); +} + +static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memHandle; + } + return nullptr; +} + +[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { + return get_qnn_tensor_memhandle(*tensor); +} + +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { + set_qnn_tensor_id(*tensor, id); +} + +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { + set_qnn_tensor_name(*tensor, name); +} + +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { + set_qnn_tensor_type(*tensor, type); +} + +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { + set_qnn_tensor_dataformat(*tensor, format); +} + +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { + set_qnn_tensor_datatype(*tensor, dataType); +} + +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { + set_qnn_tensor_quantparams(*tensor, params); +} + +static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { + set_qnn_tensor_rank(*tensor, rank); +} + +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { + set_qnn_tensor_dimensions(*tensor, dims); +} + +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} + +[[maybe_unused]] static inline void 
set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { + set_qnn_tensor_memtype(*tensor, memType); +} + +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { + set_qnn_tensor_clientbuf(*tensor, clientBuf); +} + +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } +} + +[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { + set_qnn_tensor_memhandle(*tensor, handle); +} + +inline static Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { + Qnn_Tensor_t tensor; + tensor.version = version; + if (version == QNN_TENSOR_VERSION_1) { + tensor.v1 = QNN_TENSOR_V1_INIT; + } else if (version == QNN_TENSOR_VERSION_2) { + tensor.v2 = QNN_TENSOR_V2_INIT; + } + return tensor; +} + +static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + VALIDATE_TENSOR_VERSION(src, err); + + dst.version = src.version; + QNN_TENSOR_SET_NAME( + dst, ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + return 1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; + size_t scale_offset_size = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); + ggmlqnn_memscpy(*scale_offset, + scale_offset_size, + src_qparam.axisScaleOffsetEncoding.scaleOffset, + scale_offset_size); + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); + float ** scales = &bwaxis_scale_offset.scales; + int32_t ** offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scale_size); + ggmlqnn_memscpy(*scales, scale_size, src_qparam.bwAxisScaleOffsetEncoding.scales, scale_size); + + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offset_size = bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offset_size); + 
ggmlqnn_memscpy(*offsets, offset_size, src_qparam.bwAxisScaleOffsetEncoding.offsets, offset_size); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); + } + + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = rank * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + GGMLQNN_LOG_DEBUG("tensor dims %p", dimensions); + if (dimensions == nullptr) { + GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + return 1; + } + ggmlqnn_memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); + QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + + return err; +} + +static int free_qnn_tensor(Qnn_Tensor_t * tensor) { + int err = 0; + VALIDATE_TENSOR_VERSION(*tensor, err); + free((void *) QNN_TENSOR_GET_NAME(*tensor)); + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + free(src_qparam.axisScaleOffsetEncoding.scaleOffset); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + free(src_qparam.bwAxisScaleOffsetEncoding.scales); + if (src_qparam.bwAxisScaleOffsetEncoding.offsets != nullptr) { + free(src_qparam.bwAxisScaleOffsetEncoding.offsets); + } + } + free(QNN_TENSOR_GET_DIMENSIONS(*tensor)); + free(tensor); + + return err; +} + + +static size_t qnn_datatype_size(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return sizeof(float); + case QNN_DATATYPE_FLOAT_16: + return sizeof(uint16_t); + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return sizeof(int32_t); + case QNN_DATATYPE_INT_16: + return sizeof(int16_t); + case QNN_DATATYPE_INT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_8: + return sizeof(int8_t); + case QNN_DATATYPE_SFIXED_POINT_4: + return sizeof(int8_t); + default: + break; + } + return 0; +} + +static const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return "QNN_DATATYPE_FLOAT_32"; + case QNN_DATATYPE_FLOAT_16: + return "QNN_DATATYPE_FLOAT_16"; + case QNN_DATATYPE_UINT_32: + return "QNN_DATATYPE_UINT_32"; + case QNN_DATATYPE_INT_32: + return "QNN_DATATYPE_INT_32"; + case QNN_DATATYPE_INT_16: + return "QNN_DATATYPE_INT_16"; + case QNN_DATATYPE_INT_8: + return "QNN_DATATYPE_INT_8"; + case QNN_DATATYPE_SFIXED_POINT_8: + return "QNN_DATATYPE_SFIXED_POINT_8"; + case QNN_DATATYPE_SFIXED_POINT_4: + return "QNN_DATATYPE_SFIXED_POINT_4"; + default: + break; + } + return "QNN_DATATYPE_UNDEFINED"; +} + +static const char * qnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { + // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html + switch (qnn_error_code) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; + + // QnnGraph_Error_t + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case QNN_GRAPH_ERROR_INVALID_NAME: + return 
"QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return "QNN_GRAPH_ERROR_SET_PROFILE"; + case QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; + + //QQnnTensor_Error_t + //Invalid context/graph handle in creating tensor + case QNN_TENSOR_ERROR_INVALID_HANDLE: + return "QNN_TENSOR_ERROR_INVALID_HANDLE"; + //Tensor with specified credentials not registered with a context/graph + case QNN_TENSOR_ERROR_DOES_NOT_EXIST: + return "QNN_TENSOR_ERROR_DOES_NOT_EXIST"; + // (deprecated) Tensor has already been registered with backend + case QNN_TENSOR_ERROR_ALREADY_EXISTS: + return "QNN_TENSOR_ERROR_ALREADY_EXISTS"; + // Invalid tensor param. 
+ case QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM: + return "QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM"; + // This tensor param is currently unsupported + case QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM: + return "QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM"; + // Tensor provided for update is invalid + case QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE: + return "QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE"; + + // QnnOpPackage_Error_t + case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: + return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFO: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; + case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: + return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: + return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; + + default: + return "unknown QNN error"; + } +} + +// ================================================================================================= +// section-5:ggml-qnn backend helper macro / data structure / function / class +// ================================================================================================= +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +typedef void (* ggmlqnn_op_func_t)(ggml_backend_t backend, ggml_tensor * op); + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, +}; + +enum qcom_chipset_soc_model { + UNKNOWN_SM = 0, + SM7450 = 41, // v69, 7 Gen1 + SM8350 = 30, // v68, 888 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Gen 4 +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + char soc_desc[GGML_MAX_NAME]; +}; + +//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 7 Gen 1 */ + [SM7450] = { + .soc_model = SM7450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7 Gen 1"}, + + /* Qualcomm SnapDragon 888 */ + [SM8350] = { + .soc_model = SM8350, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 888 "}, + + /* Qualcomm SnapDragon 8 Gen 1 */ + [SM8450] = { + .soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1"}, + + /* Qualcomm SnapDragon 8 Gen 1+ */ + [SM8475] = { + .soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1+"}, + + /* Qualcomm SnapDragon 8 
Gen 2 */ + [SM8550] = { + .soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 2"}, + + /* Qualcomm SnapDragon 8 Gen 3 */ + [SM8650] = { + .soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 3 "}, + + /* Qualcomm SnapDragon 8 Gen 4 */ + [SM8750] = { + .soc_model = SM8750, + .htp_arch = V79, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, + +}; + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char desc[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; + + //FIXME: should I move it from public member of class qnn_instance to here? + //std::map> _qnn_graph_map; +} ; + +//FIXME: the following global vars and three helper funcs should be removed in the future +static int32_t g_ggmltensor_idx = 0; +static void reset_idx() { + g_ggmltensor_idx = 0; +} + +static void inc_idx() { + g_ggmltensor_idx++; +} + +static int32_t get_idx() { + return g_ggmltensor_idx; +} + +// file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + [QNN_BACKEND_CPU] = {.device = 0, + .threads = 1, + .name = "qnn-cpu", + .desc = "Qualcomm Kryo CPU", + .lib = "libQnnCpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, + + [QNN_BACKEND_GPU] = {.device = 1, + .threads = 1, + .name = "qnn-gpu", + .desc = "Qualcomm Adreno GPU", + .lib = "libQnnGpu.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, + + [QNN_BACKEND_NPU] = {.device = 2, + .threads = 1, + .name = "qnn-npu", + .desc = "Qualcomm NPU(Hexagon Tensor Processor)", + .lib = "libQnnHtp.so", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, +}; + + +struct qnn_op_caps_t { + const char * qnn_op_name = nullptr; + const size_t input_param_count = 0; + const char * qnn_param_name = nullptr; +}; + +static const qnn_op_caps_t kOpCaps[] = { + {}, // GGML_OP_NONE + {}, // GGML_OP_DUP + { + // GGML_OP_ADD + QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name + 2, // input_param_count + }, + {}, // GGML_OP_ADD1 + {}, // GGML_OP_ACC + {}, // GGML_OP_SUB + {}, // GGML_OP_MUL + {}, // GGML_OP_DIV + {}, // GGML_OP_SQR + {}, // GGML_OP_SQRT + {}, // GGML_OP_LOG + {}, // GGML_OP_SIN + {}, // GGML_OP_COS + {}, // GGML_OP_SUM + {}, // GGML_OP_SUM_ROWS + {}, // GGML_OP_MEAN + {}, // GGML_OP_ARGMAX + {}, // GGML_OP_COUNT_EQUAL + {}, // GGML_OP_REPEAT + {}, // GGML_OP_REPEAT_BACK + {}, // GGML_OP_CONCAT + {}, // GGML_OP_SILU_BACK + {}, // GGML_OP_NORM + {}, // GGML_OP_RMS_NORM + {}, // GGML_OP_RMS_NORM_BACK + {}, // GGML_OP_GROUP_NORM + { + // GGML_OP_MUL_MAT + 
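+        // ggml's MUL_MAT is mapped to QNN's MatMul op; rows left as {} keep the defaults declared in
+        // qnn_op_caps_t (nullptr qnn_op_name, zero input_param_count)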
QNN_OP_MAT_MUL, // qnn_op_name + 2, // input_param_count + }, + {}, // GGML_OP_MUL_MAT_ID + {}, // GGML_OP_OUT_PROD + {}, // GGML_OP_SCALE + {}, // GGML_OP_SET + {}, // GGML_OP_CPY + {}, // GGML_OP_CONT + {}, // GGML_OP_RESHAPE + {}, // GGML_OP_VIEW + {}, // GGML_OP_PERMUTE + {}, // GGML_OP_TRANSPOSE + {}, // GGML_OP_GET_ROWS + {}, // GGML_OP_GET_ROWS_BACK + {}, // GGML_OP_DIAG + {}, // GGML_OP_DIAG_MASK_INF + {}, // GGML_OP_DIAG_MASK_ZERO + {}, // GGML_OP_SOFT_MAX + {}, // GGML_OP_SOFT_MAX_BACK + {}, // GGML_OP_ROPE + {}, // GGML_OP_ROPE_BACK + {}, // GGML_OP_CLAMP + {}, // GGML_OP_CONV_TRANSPOSE_1D + {}, // GGML_OP_IM2COL + {}, // GGML_OP_IM2COL_BACK + {}, // GGML_OP_CONV_TRANSPOSE_2D + {}, // GGML_OP_POOL_1D + {}, // GGML_OP_POOL_2D + {}, // GGML_OP_POOL_2D_BACK + {}, // GGML_OP_UPSCALE + {}, // GGML_OP_PAD + {}, // GGML_OP_PAD_REFLECT_1D + {}, // GGML_OP_ARANGE + {}, // GGML_OP_TIMESTEP_EMBEDDING + {}, // GGML_OP_ARGSORT + {}, // GGML_OP_LEAKY_RELU + {}, // GGML_OP_FLASH_ATTN_EXT + {}, // GGML_OP_FLASH_ATTN_BACK + {}, // GGML_OP_SSM_CONV + {}, // GGML_OP_SSM_SCAN + {}, // GGML_OP_WIN_PART + {}, // GGML_OP_WIN_UNPART + {}, // GGML_OP_GET_REL_POS + {}, // GGML_OP_ADD_REL_POS + {}, // GGML_OP_RWKV_WKV6 + {}, // GGML_OP_GATED_LINEAR_ATTN + {}, // GGML_OP_UNARY + {}, // GGML_OP_MAP_UNARY + {}, // GGML_OP_MAP_BINARY + {}, // GGML_OP_MAP_CUSTOM1_F32 + {}, // GGML_OP_MAP_CUSTOM2_F32 + {}, // GGML_OP_MAP_CUSTOM3_F32 + {}, // GGML_OP_MAP_CUSTOM1 + {}, // GGML_OP_MAP_CUSTOM2 + {}, // GGML_OP_MAP_CUSTOM3 + {}, // GGML_OP_CROSS_ENTROPY_LOSS + {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + {}, // GGML_OP_OPT_STEP_ADAMW + {}, // GGML_UNARY_OP_ABS + {}, // GGML_UNARY_OP_SGN + {}, // GGML_UNARY_OP_NEG + {}, // GGML_UNARY_OP_STEP + {}, // GGML_UNARY_OP_TANH + {}, // GGML_UNARY_OP_ELU + {}, // GGML_UNARY_OP_RELU + {}, // GGML_UNARY_OP_SIGMOID + {}, // GGML_UNARY_OP_GELU + {}, // GGML_UNARY_OP_GELU_QUICK + {}, // GGML_UNARY_OP_SILU + {}, // GGML_UNARY_OP_HARDSWISH + {}, // GGML_UNARY_OP_HARDSIGMOID + {}, // GGML_UNARY_OP_EXP +}; + +static const char * qnn_get_socmodel_desc(uint32_t soc_model) { + switch (soc_model) { + case SM7450: + return "SM7450"; + case SM8350: + return "SM8350"; + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + case SM8750: + return "SM8750"; + default: + return "unknown"; + } +} + +static const char * qnn_get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + case V79: + return "QCOM_HTP_V79"; + default: + return "unknown"; + } +} + +static struct qcom_socinfo * qnn_get_socinfo_from_socmodel(uint32_t soc_model) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (soc_model == g_qnn_soc_info_table[idx].soc_model) { + return &g_qnn_soc_info_table[idx]; + } + } + return nullptr; +} + +static bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + return true; +} + +#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ 
+ do { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { + /* + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; + */ + return ggml_n_dims(tensor); +} + +static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + +static const char * ggml_get_type_name(ggml_type type) { + const struct ggml_type_traits * traits = ggml_get_type_traits(type); + return traits->type_name; +} + +Qnn_Tensor_t * ggml_qnn_create_tensor(const ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {0}; + + //FIXME:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); + GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); + inc_idx(); + + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = ggml_get_tensor_rank(tensor), + .dimensions = dimensions, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {.data = nullptr, + .dataSize = 0}}}} + }; + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + GGMLQNN_LOG_WARN("calloc failed"); + return nullptr; + } + error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + GGMLQNN_LOG_WARN("init tensor failed"); + return nullptr; + } + + return p_qnn_tensor; +} + +//TODO: +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; + } + return QNN_DATATYPE_UNDEFINED; +} + +//TODO: +static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return GGML_TYPE_F32; + case QNN_DATATYPE_FLOAT_16: + return GGML_TYPE_F16; + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return GGML_TYPE_I32; + case QNN_DATATYPE_INT_16: + return GGML_TYPE_I16; + case QNN_DATATYPE_INT_8: + return GGML_TYPE_I8; + case 
QNN_DATATYPE_SFIXED_POINT_8: + return GGML_TYPE_Q8_0; + case QNN_DATATYPE_SFIXED_POINT_4: + return GGML_TYPE_Q4_0; + default: + break; + } + return GGML_TYPE_COUNT; +} + +//TODO: add more ops +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; +} + +static const char * get_ggml_type_name(ggml_type type) { + const auto * traits = ggml_get_type_traits(type); + return traits->type_name; +} + +static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { + char buffer[256] = {}; + const char * type_name = get_ggml_type_name(tensor->type); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], type_name); + break; + } + GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); + output.append(buffer, len); +} + +constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; + +static size_t get_qnn_op_index(const ggml_tensor * tensor) { + if (tensor->op == GGML_OP_UNARY) { + return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + } + + return tensor->op; +} + +static size_t get_qnn_op_input_param_count(const ggml_tensor * op) { + auto op_index = get_qnn_op_index(op); + GGML_ASSERT(op_index < std::size(kOpCaps)); + return kOpCaps[op_index].input_param_count; +} + +static void get_graph_key_from_op(const ggml_tensor * op, std::string & output) { + GGML_ASSERT(op->op != GGML_OP_NONE); + output += ggml_op_desc(op); + output += get_ggml_type_name(op->type); + size_t param_count = get_qnn_op_input_param_count(op); + for (size_t i = 0; i < param_count; ++i) { + auto * input = op->src[i]; + if (!input) { + break; + } + output += '_'; + append_tensor_dimensions(input, output); + } +} + +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() { + _begin_time = ggml_time_us(); + } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + } + +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; +#else +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() {} + void info() {} +}; +#endif + +template +Fn load_qnn_functionpointers(void * handle, const char * function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} + +class qnn_interface { + +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + 
} + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t *_qnn_interface = nullptr; + + const QnnSystemInterface_t *_qnn_sys_interface = nullptr; +}; + +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {}; + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface &get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + int init_qnn_graph(const std::string &graph_name, QNNBackend device, size_t vtcm_size_in_mb); + + int finalize_qnn_graph(); + + bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } + + int init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; + } + + int set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; + memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); + rpc_pollingtime.option = + QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + } + } + return 0; + } + + int set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + 
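+            // _qnn_htp_perfinfra is populated by init_htp_perfinfra(); without it no DCVS v3 power profile can be applied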
GGMLQNN_LOG_DEBUG("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + power_config.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + power_config.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + power_config.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + + return 0; + } + + std::string & get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); + + void unregister_rpcmem(); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle); + + void * alloc_rpcmem(size_t bytes, size_t alignment); + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); + + void free_rpcmem(void * buf); + void free_rpcmem(); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + + bool enable_qnn_rpc() { + return _enable_qnn_rpc; + } + +public: + std::map> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_system_interface = raw_interface; 
+ } + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // name of prebuilt QNN model, might be used in the future + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + qnn_interface _qnn_interface; + + void *_system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_rpc_buffer_to_handles; + + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + void * _rpc_lib_handle = nullptr; + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + size_t _rpcmem_capacity = 512; + + std::string _graph_name; + QNNBackend _device_id; + bool _enable_qnn_rpc = false; //FIXME:unknown issue with QNN RPC feature + + DISABLE_COPY(qnn_instance); + DISABLE_MOVE(qnn_instance); +}; + +std::mutex qnn_instance::_init_mutex; +std::unordered_map qnn_instance::_loaded_lib_handle; +std::unordered_map qnn_instance::_lib_path_to_backend_id; +std::unordered_map qnn_instance::_loaded_backend; + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (buf == nullptr) { + GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(ggmlqnn_align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + + return aligned_buf; +} + +void qnn_instance::free_rpcmem(void * buf) { + if (!_rpcmem_initialized) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + GGMLQNN_LOG_WARN("no allocated tensor\n"); + } else { + GGMLQNN_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]); + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } +} + +void qnn_instance::free_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + 
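+    // release every ION buffer still tracked in _rpcmem_store_map (aligned pointer -> raw pointer from rpcmem_alloc)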
if (_rpcmem_store_map.empty()) {
+        GGMLQNN_LOG_WARN("no rpcmem allocated\n");
+        return;
+    }
+
+    for (std::unordered_map<void *, void *>::iterator it = _rpcmem_store_map.begin();
+         it != _rpcmem_store_map.end();
+         it++) {
+        void * rpcbuffer = it->second;
+        GGMLQNN_LOG_DEBUG("free rpc buffer %p", rpcbuffer);
+        _pfn_rpc_mem_free(rpcbuffer);
+    }
+    _rpcmem_store_map.clear();
+}
+
+int32_t qnn_instance::rpcmem_to_fd(void * buf) {
+    int32_t mem_fd = -1;
+    if (!is_rpcmem_initialized()) {
+        GGMLQNN_LOG_WARN("rpc memory not initialized\n");
+    } else {
+        mem_fd = _pfn_rpc_mem_to_fd(buf);
+    }
+
+    return mem_fd;
+}
+
+int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) {
+    if (nullptr == p_data || (nullptr == p_tensor)) {
+        GGMLQNN_LOG_WARN("invalid param\n");
+        return 1;
+    }
+
+    if (!is_rpcmem_initialized()) {
+        GGMLQNN_LOG_WARN("rpc memory not initialized\n");
+        return 2;
+    }
+
+    if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) {
+        GGMLQNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
+        return 4;
+    }
+
+    int32_t mem_fd = rpcmem_to_fd(p_data);
+    if (-1 == mem_fd) {
+        GGMLQNN_LOG_WARN("failed to get file descriptor\n");
+        return 5;
+    }
+    GGMLQNN_LOG_DEBUG("mem_fd %d\n", mem_fd);
+    Qnn_MemDescriptor_t descriptor = {
+        {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr},
+        QNN_VER_PTR(*p_tensor)->dataType,
+        QNN_MEM_TYPE_ION,
+        {{mem_fd}}};
+    Qnn_MemHandle_t handle = nullptr;
+    int error = QNN_SUCCESS;
+    error = _qnn_interface.qnn_mem_register(
+        _qnn_context_handle,
+        &descriptor,
+        /*numDescriptors=*/1,
+        &handle);
+    if (error != QNN_SUCCESS) {
+        GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error),
+                         strerror(error));
+        return 6;
+    } else {
+        GGMLQNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name));
+    }
+    QNN_VER_PTR(*p_tensor)->memHandle = handle;
+    _qnn_mem_set.insert((std::pair(p_data, handle)));
+
+    return 0;
+}
+
+Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type) {
+    if (!p_data) {
+        GGMLQNN_LOG_WARN("invalid param");
+        return nullptr;
+    }
+
+    if (!is_rpcmem_initialized()) {
+        GGMLQNN_LOG_WARN("rpc memory not initialized");
+        return nullptr;
+    }
+
+    if (is_rpcmem_registered(p_data)) {
+        GGMLQNN_LOG_WARN("rpc memory already registered");
+        return _qnn_rpc_buffer_to_handles[p_data];
+    }
+
+    auto mem_fd = rpcmem_to_fd(p_data);
+    if (mem_fd == -1) {
+        GGMLQNN_LOG_WARN("failed to get file descriptor");
+        return nullptr;
+    }
+
+    GGMLQNN_LOG_DEBUG("mem_fd %d", mem_fd);
+    Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}};
+    Qnn_MemHandle_t handle = nullptr;
+    auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor,
+                                                 /*numDescriptors=*/1, &handle);
+    if (error != QNN_SUCCESS) {
+        GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error));
+        return nullptr;
+    }
+
+    _qnn_rpc_buffer_to_handles.insert({p_data, handle});
+    GGMLQNN_LOG_DEBUG("successfully register shared memory handler: %p", handle);
+    return handle;
+}
+
+void * qnn_instance::get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) {
+    for (std::unordered_map<void *, Qnn_MemHandle_t>::iterator it = _qnn_mem_set.begin();
+         it != _qnn_mem_set.end();
+         it++) {
+        if (it->second == mem_handle) {
+            return it->first;
+        }
+    }
+    GGMLQNN_LOG_WARN("can't find rpcmem 
from qnn mem handle %p", mem_handle); + return nullptr; +} + +void qnn_instance::unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + if (_qnn_mem_set.empty()) { + GGMLQNN_LOG_WARN("no rpcmem registered\n"); + } + + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); + it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", + QNN_GET_ERROR_CODE(error)); + } else { + GGMLQNN_LOG_DEBUG("unregister shared memory ok"); + } + } + _qnn_mem_set.clear(); +} + +void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) { + Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); + } + + auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), + [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + if (it == _qnn_mem_set.end()) { + GGMLQNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); + return; + } + + _qnn_mem_set.erase(it); +} + +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; +} + +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + GGMLQNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + + void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + GGMLQNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; + } + + auto get_providers = + load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + GGMLQNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; + } + + // get QnnInterface Providers + std::uint32_t num_providers = 0; + const QnnInterface_t **provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + GGMLQNN_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + GGMLQNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + GGMLQNN_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; + } + } + + if (!found_valid_interface) { + GGMLQNN_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + GGMLQNN_LOG_INFO("find a valid qnn interface\n"); + } + set_qnn_raw_interface(qnn_interface); + + BackendIdType backend_id = provider_list[0]->backendId; + _lib_path_to_backend_id[lib_path] = backend_id; + if (_loaded_backend.count(backend_id) > 0) { + GGMLQNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", + lib_path.c_str(), 
backend_id); + } + _loaded_backend[backend_id] = provider_list[0]; + if (_loaded_lib_handle.count(backend_id) > 0) { + GGMLQNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); + int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); + if (dlclose_error != 0) { + GGMLQNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); + } + } + _loaded_lib_handle[backend_id] = lib_handle; + _backend_id = backend_id; + +#if 0 // keep them here for further use + QnnSaver_Config_t outputdir_cfg; + outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; + outputdir_cfg.outputDirectory = "/data/local/tmp/"; + QnnSaver_Config_t backendid_cfg; + backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; + backendid_cfg.backendId = _backend_id; + const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; + if (0 == QnnSaver_initialize(saverCfg)) { + GGMLQNN_LOG_INFO("QnnSaver_initialize successfully"); + } else { + GGMLQNN_LOG_WARN("QnnSaver_initialize failure"); + } +#endif + auto saver_initialize = + load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( + _loaded_lib_handle[backend_id], "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; + } + } else { + GGMLQNN_LOG_WARN("saver_initialize is null\n"); + } + + return 0; +} + +int qnn_instance::unload_backend() { + int dlclose_error = 0; + for (auto &it : _loaded_lib_handle) { + dlclose_error = dlclose(it.second); + if (dlclose_error != 0) { + GGMLQNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); + } + } + + _loaded_lib_handle.clear(); + _lib_path_to_backend_id.clear(); + _loaded_backend.clear(); + + return 0; +} + +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + std::string system_lib_path = _lib_path + "libQnnSystem.so"; + GGMLQNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + //re-try with default path of QNN binary runtime lib + _lib_path = "/data/local/tmp/"; + system_lib_path = _lib_path + "libQnnSystem.so"; + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + } + + auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + GGMLQNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } + + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } + + if (num_providers != _required_num_providers) { + GGMLQNN_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } + + if (nullptr == provider_list) { + GGMLQNN_LOG_WARN("can not get providers\n"); + return 5; + } + + QNN_SYSTEM_INTERFACE_VER_TYPE 
qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + GGMLQNN_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + GGMLQNN_LOG_INFO("find a valid qnn system interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); + + _qnn_interface.set_qnn_system_interface(provider_list[0]); + + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + GGMLQNN_LOG_WARN("can not create QNN system contenxt\n"); + } else { + GGMLQNN_LOG_INFO("initialize qnn system successfully\n"); + } + + return 0; +} + +int qnn_instance::unload_system() { + int result = 0; + + if (nullptr == _system_lib_handle) { + GGMLQNN_LOG_DEBUG("system lib handle is null\n"); + return 1; + } + + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } + + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + GGMLQNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } + + _system_lib_handle = nullptr; + + return result; +} + +static void ggml_qnn_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { + + static std::mutex log_mutex; + static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } + + double ms = (double) timestamp / 1000000.0; + + { + std::lock_guard lock(log_mutex); + + memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); +#if GGMLQNN_PRINT_QNN_INTERNAL_LOG + GGMLQNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); +#endif + } +} + +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + GGMLQNN_LOG_DEBUG("enter qni_init\n"); + + const std::lock_guard lock(_init_mutex); + + if (0 != load_system()) { + GGMLQNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + GGMLQNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string bakend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { + int is_load_ok = load_backend(bakend_lib_path, saver_config); + if (0 != is_load_ok) { + GGMLQNN_LOG_WARN("failed to load QNN backend\n"); + return 2; + } + } + + backend_id = _lib_path_to_backend_id[bakend_lib_path]; + if (0 == _loaded_backend.count(backend_id) || + 0 == _loaded_lib_handle.count(backend_id)) { + GGMLQNN_LOG_WARN("library %s is loaded but loaded backend 
count=%zu, loaded lib_handle count=%zu\n", + bakend_lib_path.c_str(), + _loaded_backend.count(backend_id), + _loaded_lib_handle.count(backend_id)); + return 3; + } + + _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + +#if 1 + _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); +#else + _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); +#endif + if (nullptr == _qnn_log_handle) { + GGMLQNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone + return 4; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn log successfully\n"); + } + + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create(_qnn_log_handle, + temp_backend_config.empty() ? nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + GGMLQNN_LOG_WARN("why failed to initialize qnn backend\n"); + return 5; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn backend successfully\n"); + } + + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnstatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnstatus) { + GGMLQNN_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnstatus) { + GGMLQNN_LOG_WARN("device property is not known to backend\n"); + } + } + + auto qnnstatus = _qnn_raw_interface.deviceCreate( + _qnn_log_handle, nullptr, &_qnn_device_handle); + if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { + GGMLQNN_LOG_WARN("failed to create QNN device\n"); + } else { + GGMLQNN_LOG_INFO("create device successfully\n"); + } + + if (ggml_qnn_profile_level::profile_off != _profile_level) { + GGMLQNN_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (ggml_qnn_profile_level::profile_basic == _profile_level) { + GGMLQNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { + GGMLQNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); + return 7; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } + } + + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + GGMLQNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 9; + } else { + GGMLQNN_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free + || nullptr == _pfn_rpc_mem_to_fd) { + GGMLQNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 10; + } + + if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_init(); + + std::vector temp_context_config; + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? nullptr : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + GGMLQNN_LOG_WARN("why failed to initialize qnn context, error:%s\n", strerror(errno)); + return 8; + } else { + GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n"); + } + + if (_backend_name.find("Htp") != std::variant_npos) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + GGMLQNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (int i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLQNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + GGMLQNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); + GGMLQNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ + chipinfo.socModel, qnn_get_socmodel_desc(chipinfo.socModel), \ + htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); + struct qcom_socinfo * socinfo = qnn_get_socinfo_from_socmodel(chipinfo.socModel); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + if (nullptr != socinfo) { + memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); + GGMLQNN_LOG_INFO("soc info:%s", socinfo->soc_desc); + } else { + memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, "unknown", 7); + GGMLQNN_LOG_INFO("soc info:unknown"); + } + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + + if (0 != init_htp_perfinfra()) { + GGMLQNN_LOG_WARN("initialize HTP performance failure"); + } + if (0 != set_rpc_polling()) { + GGMLQNN_LOG_WARN("set RPC polling failure"); + } + if (0 != set_high_performance_mode()) { + GGMLQNN_LOG_WARN("set HTP high performance mode failure"); + } + } + + GGMLQNN_LOG_DEBUG("leave qni_init\n"); + + return 0; +} + +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; + + GGMLQNN_LOG_DEBUG("enter %s\n", __func__); + //FIXME:should be removed in the future + reset_idx(); + + free_rpcmem(); + unregister_rpcmem(); + + if (nullptr != _pfn_rpc_mem_deinit) + _pfn_rpc_mem_deinit(); + + if (dlclose(_rpc_lib_handle) != 0) { + GGMLQNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + GGMLQNN_LOG_DEBUG("succeed to close rpcmem lib\n"); + } + + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_context_handle = nullptr; + } + + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_profile_handle = nullptr; + } + + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed 
to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; + + } + + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; + } + + unload_backend(); + + unload_system(); + GGMLQNN_LOG_DEBUG("leave %s\n", __func__); + + return ret_status; +} + +int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb) { + _graph_name = graph_name; + _device_id = device; + + GGMLQNN_LOG_DEBUG("[%s][%s]created", ggml_backend_qnn_get_devname(device), graph_name.c_str()); + + Qnn_ErrorHandle_t error = QNN_SUCCESS; + Qnn_GraphHandle_t graph_handle = nullptr; + if (device == QNN_BACKEND_NPU) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 8; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + const QnnGraph_Config_t * graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, + &graph_opt_config, nullptr}; + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs, &graph_handle); + } else { + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &graph_handle); + } + + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", + ggml_backend_qnn_get_devname(device), graph_name.c_str(), + qnn_get_error_string(error)); + return error; + } + + GGMLQNN_LOG_INFO("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); + _qnn_graph_handle = graph_handle; + return QNN_SUCCESS; +} + +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + int result = 0; + + if (nullptr == graph_name) { + GGMLQNN_LOG_WARN("graph name is null\n"); + return 1; + } + + if (!_graph_name.empty()) { + GGMLQNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } + + if (!do_node_validation) { + GGMLQNN_LOG_WARN("node validation disabled, backend will not perform 
op validation prior to adding node\n"); + } + + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, + graph_name, + graph_configs, + &_qnn_graph_handle); + if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + GGMLQNN_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + GGMLQNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + } + + return 0; +} + +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, nullptr) + != QNN_GRAPH_NO_ERROR) { + GGMLQNN_LOG_WARN("finalizing graph failure\n"); + return 1; + } + } else { + GGMLQNN_LOG_DEBUG("qnn graph handle is null\n"); + } + + return 0; +} + +static uint8_t * create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { + if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { + GGMLQNN_LOG_WARN("invalid params\n"); + return nullptr; + } + + uint8_t * qnn_rpcbuffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(ggml_tensor), 4)); + if (nullptr == qnn_rpcbuffer) { + GGMLQNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); + return nullptr; + } else { + GGMLQNN_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer); + } + if (b_copydata) + memcpy(qnn_rpcbuffer, ggml_tensor->data, ggml_nbytes(ggml_tensor)); + instance->register_rpcmem(qnn_rpcbuffer, qnn_tensor); + return qnn_rpcbuffer; +} + +static Qnn_ErrorHandle_t create_htp_graph(ggml_backend_qnn_context * ctx, const std::string & graph_name, Qnn_GraphHandle_t * graph_handle) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + if (nullptr == ctx) + return QNN_MIN_ERROR_COMMON; + + qnn_instance * instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = 4; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; + + QnnHtpGraph_CustomConfig_t dlbc_config; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; + + QnnHtpGraph_CustomConfig_t opt_config; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 3; // 1 or 3 + QnnGraph_Config_t graph_opt_config; + graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; + + QnnHtpGraph_CustomConfig_t vtcm_config; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; + + QnnHtpGraph_CustomConfig_t precision_config; + precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + precision_config.precision = QNN_PRECISION_FLOAT16; + QnnGraph_Config_t 
graph_precision_config; + graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_precision_config.customConfig = &precision_config; + + const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, + &graph_dlbc_config, + &graph_vtcm_config, + &graph_opt_config, + &graph_precision_config, + NULL}; + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), + p_graphconfig, graph_handle); + return error; +} + +static void print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + //skip sanity check of params + GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + GGMLQNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + GGMLQNN_LOG_DEBUG("tensor0 name %s", src0->name); + GGMLQNN_LOG_DEBUG("tensor1 name %s", src1->name); + GGMLQNN_LOG_DEBUG("tensor2 name %s", dst->name); +} + +static void dump_tensors_info(const struct ggml_tensor * tensor) { + //skip sanity check of params + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; + GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), + ggml_type_name(tensor->type)); + GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); + GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); + GGMLQNN_LOG_DEBUG( + "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + GGMLQNN_LOG_DEBUG( + "src1 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + GGMLQNN_LOG_DEBUG( + " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + tensor->name, + tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], + tensor->ne[2], + tensor->nb[0], + tensor->nb[1], tensor->nb[2]); +} + +// ================================================================================================= +// section-6: implementation of ggml-qnn backend +// ================================================================================================= +static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { + if (tensor->op == GGML_OP_NONE) { + return true; + } + if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE + || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW + || tensor->op == 
GGML_OP_PERMUTE) { + return false; + } + + bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); + if (!supported_op) { + return false; + } + + struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; + + int64_t ne00 = tensor->src[0]->ne[0]; + int64_t ne01 = tensor->src[0]->ne[1]; + + int64_t ne10 = tensor->src[1]->ne[0]; + int64_t ne11 = tensor->src[1]->ne[1]; + + int64_t ne0 = tensor->ne[0]; + int64_t ne1 = tensor->ne[1]; + + if (tensor->op == GGML_OP_ADD) { + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + + if (ne00 < 32) + return false; + +#if GGMLQNN_PRINT_OP_ADD_LOG + dump_tensors_info(tensor); +#endif + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) + && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); + + } + + if (tensor->op == GGML_OP_MUL_MAT) { +#if GGMLQNN_PRINT_OP_MUL_MAT_LOG + dump_tensors_info(tensor); +#endif + //FIXME: 2048 is an experimental value between ASR inference and LLM inference because + // it's better only offload big matrix to QNN backend + if (ne01 <= 2048) { + return false; + } +#if 0 + //TODO: offload mul_mat to QNN backend + //need to process type trait in func ggml_qnn_mul_mat(...): + //src0: q4_0, q6_k, ... + //src1: f32 + //dst : f32 + return (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) + && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16); +#else + //fall back to ggml cpu backend + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) + && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) + && (src0->type == src1->type) && (src0->type == tensor->type); +#endif + } + + //TODO:for other op + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) + && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) + && (src0->type == src1->type) && (src0->type == tensor->type); +} + +static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + enum ggml_status result = GGML_STATUS_SUCCESS; + bool graph_initialized = false; + qnn_instance * instance = nullptr; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; + std::string graph_name = "ggml_op_qnn_add"; + qnn_perf op_perf = qnn_perf("ggml_qnn_add"); + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + uint8_t * qnn_rpcbuffer_0 = nullptr; + uint8_t * qnn_rpcbuffer_1 = nullptr; + uint8_t * qnn_rpcbuffer_2 = nullptr; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + op_perf.start(); + + std::string map_entry; + get_graph_key_from_op(op, map_entry); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + } else { + tensor_0 = 
ggml_qnn_create_tensor(src0); + tensor_1 = ggml_qnn_create_tensor(src1); + tensor_2 = ggml_qnn_create_tensor(dst); + } + + print_tensors_info(__func__, ctx, src0, src1, dst); + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; + + if (!graph_initialized) { + graph_name = map_entry; + GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + if (ctx->device == QNN_BACKEND_NPU) { + error = create_htp_graph(ctx, graph_name, &graph_handle); + } else { + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), + nullptr, &graph_handle); + } + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + + if (enable_npu_rpc) { + QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_0)->clientBuf = {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_1)->clientBuf = {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*tensor_2)->clientBuf = {.data=nullptr, .dataSize=0}; + } + + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0)); + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1)); + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2)); + + if (enable_npu_rpc) { + qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, tensor_0, true); + qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, tensor_1, true); + qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, tensor_2, false); + if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { + GGMLQNN_LOG_INFO("create rpc buffer failure\n"); + //FIXME: potential memory leak althought it shouldn't happen + return; + } + } else { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + } + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_add", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_ELEMENT_WISE_ADD, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + CHECK_QNN_API(error = qnn_raw_interface.graphAddNode(graph_handle, op_config)); + CHECK_QNN_API(error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + CHECK_QNN_API(error); + + if (enable_npu_rpc) { + uint8_t * qnn_rpcbuffer = 
static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); + if (nullptr != qnn_rpcbuffer) { + memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); + } + } + + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + + } else { + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + if (enable_npu_rpc) { + //FIXME: fails with test-backend-ops, root cause unknown + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_buffer_0 = %p\n", qnn_buffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } + + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_1)->memHandle)); + GGMLQNN_LOG_INFO("qnn_buffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + } else { + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + } + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + CHECK_QNN_API(error); + + if (enable_npu_rpc) { + //FIXME: fails with test-backend-ops, root cause unknown + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + } + } + + //avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; +#if GGMLQNN_PRINT_OP_ADD_LOG + op_perf.info(); +#endif +} + +//TODO: +/* + * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add, but type traits and matrix transpose are required + * to offload mulmat to the QNN backend, so it's a standalone function. + * + * MUL_MAT takes most of the compute time (about 95%), so to speed up llama inference we should focus on MUL_MAT. 
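+ * note: "matrix transpose" is needed because ggml and QNN order tensor dimensions differently (ne[0] and ne[1]
+ *       are swapped when a ggml tensor is mapped to a QNN tensor), and "type trait" refers to dequantizing a
+ *       quantized src0 (q4_0, q6_k, ...) to f32 before it is handed to the QNN mulmat op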
+ * + * we have three kinds of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend + * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1 +*/ +static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); + qnn_instance * instance = nullptr; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + + std::string graph_name = "ggml_op_qnn_mul_mat"; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * tensor_0 = nullptr; + Qnn_Tensor_t * tensor_1 = nullptr; + Qnn_Tensor_t * tensor_2 = nullptr; + + Qnn_Param_t qnn_params[] = {}; + + enum ggml_op ggmlop = GGML_OP_ADD; + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + op_perf.start(); + + std::string map_entry; + get_graph_key_from_op(op, map_entry); + if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + auto & graph_item = instance->_qnn_graph_map[map_entry]; + graph_handle = std::get<0>(graph_item); + tensor_0 = std::get<1>(graph_item); + tensor_1 = std::get<2>(graph_item); + tensor_2 = std::get<3>(graph_item); + } else { + tensor_0 = ggml_qnn_create_tensor(src0); + tensor_1 = ggml_qnn_create_tensor(src1); + tensor_2 = ggml_qnn_create_tensor(dst); + } + + print_tensors_info(__func__, ctx, src0, src1, dst); + + QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + + if (!graph_initialized) { + graph_name = map_entry; + GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0)); + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1)); + CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2)); + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + 
*tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + Qnn_OpConfig_t op_config = { + (Qnn_OpConfigVersion_t) 1, .v1 = { + "ggml_op_mul_mat", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + CHECK_QNN_API(error = qnn_raw_interface.graphAddNode(graph_handle, op_config)); + CHECK_QNN_API(error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + CHECK_QNN_API(error); + auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); + instance->_qnn_graph_map[map_entry] = graph_item; + + } else { + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + + QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + + QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; + QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + + QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *tensor_0, + *tensor_1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *tensor_2 + }; + error = qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr); + CHECK_QNN_API(error); + } + + //avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + + op_perf.info(); +} + +static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) { + ggmlqnn_op_func_t func = nullptr; + + switch (tensor->op) { + case GGML_OP_ADD: + func = ggml_qnn_add; + break; + + case GGML_OP_MUL_MAT: + func = ggml_qnn_mul_mat; + break; + + default: + return false; + } + + if (nullptr != func) + func(backend, tensor); + + return true; +} + +struct ggml_backend_qnn_buffer_context { + ~ggml_backend_qnn_buffer_context() { + if (buffer) { + free(buffer); + } + + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + for (auto * qnn_tensor : qnn_tensors) { + free_qnn_tensor(qnn_tensor); + } + + sub_buffers.clear(); + qnn_tensors.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_qnn_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; + std::vector qnn_tensors; +}; + +static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + delete ctx; +} + +static void * 
ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + + return ctx->buffer; +} + +static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + GGML_UNUSED(error); + GGML_UNUSED(ctx); + return; +} + +static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, const void * data, + size_t offset, size_t size) { + GGML_UNUSED(buffer); + + memcpy((char *)tensor->data + offset, data, size); +} + +static void ggml_backend_qnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor * tensor, + uint8_t value, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memset((char *)tensor->data + offset, value, size); +} + +static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); +} + +static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + + return false; +} + +static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + memset(ctx->buffer, value, ctx->buffer_size); +} + +[[maybe_unused]]static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + free(sub_buffer); + } + ctx->sub_buffers.clear(); +} + +static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { + /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, + /* .get_base = */ ggml_backend_qnn_buffer_get_base, + /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_qnn_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, + /* .clear = */ ggml_backend_qnn_buffer_clear, + /* .reset = */ NULL, +}; + +static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "qnn-buffer"; +} + +static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + + size_t size_page = sysconf(_SC_PAGESIZE); + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + ctx->buffer = ggmlqnn_host_malloc(size_aligned); + ctx->buffer_size = size_aligned; + if (nullptr == ctx->buffer) { + GGMLQNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); + return nullptr; + } + + return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); +} + +static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} + +//FIXME: 
this value is an experimental value on Xiaomi14 +static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + return (2 * (1 << 30)); +} + +static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} + +static const char * ggml_backend_qnn_name(ggml_backend_t backend) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + return g_qnn_mgr[ctx->device].name; +} + +static void ggml_backend_qnn_free(ggml_backend_t backend) { + GGMLQNN_LOG_DEBUG("enter %s", __func__ ); + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + GGMLQNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + + qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; + if (instance != nullptr) { + std::map>::iterator graph_it; + + for (graph_it = instance->_qnn_graph_map.begin(); + graph_it != instance->_qnn_graph_map.end(); graph_it++) { + auto & graph_item = graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); + Qnn_Tensor_t * tensor_0 = std::get<1>(graph_item); + Qnn_Tensor_t * tensor_1 = std::get<2>(graph_item); + Qnn_Tensor_t * tensor_2 = std::get<3>(graph_item); + GGML_UNUSED(graph_handle); + GGMLQNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + free_qnn_tensor(tensor_0); + free_qnn_tensor(tensor_1); + free_qnn_tensor(tensor_2); + } + instance->_qnn_graph_map.clear(); + + instance->qnn_finalize(); + delete instance; + g_qnn_mgr[ctx->device].instance = nullptr; + } + + if (g_qnn_mgr[ctx->device].backend != nullptr) { + delete backend; + g_qnn_mgr[ctx->device].backend = nullptr; + } + GGMLQNN_LOG_DEBUG("leave %s", __func__ ); +} + +static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + GGML_UNUSED(ctx); + + GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE + || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW + || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + bool ok = ggml_qnn_compute_forward(backend, node); + if (!ok) { + GGMLQNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", + __func__, node->name, ggml_op_name(node->op)); + } + } + + return result; +} + +static const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { + struct ggml_backend_qnn_context *ctx = static_cast(dev->context); + if (nullptr == ctx) { + GGMLQNN_LOG_ERROR("the device context is null"); + return "unknown"; + } + return ctx->name; + + GGML_UNUSED(dev); +} + +static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { + struct ggml_backend_qnn_context * ctx = static_cast(dev->context); + if (nullptr == ctx) { + GGMLQNN_LOG_ERROR("the device context is null"); + return "unknown"; + } + if (0 == strncmp(ctx->name, "qnn-npu", 7)) { + const char * soc_info = qnn_get_socmodel_desc(ctx->socinfo.soc_model); + const char * htp_arch = qnn_get_htparch_desc(ctx->socinfo.htp_arch); + static std::string dev_desc; // keep the string alive after return: c_str() of a function-local std::string would dangle + dev_desc = std::string(ctx->desc) + + std::string(soc_info) + "_" + std::string(htp_arch) + + "," + std::string(ctx->socinfo.soc_desc); + return dev_desc.c_str(); + } else { + return ctx->desc; + } +} + 
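+// the functions below wire the QNN backend into ggml's backend device / registry interfaces: device
+// introspection (memory / type / props), backend and buffer-type creation, and registration via ggml_backend_qnn_reg()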
+static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + //FIXME:this is NOT QNN device memory info + *free = get_system_free_memory_in_bytes(); + *total = get_system_total_memory_in_bytes(); + GGML_UNUSED(dev); +} + +static enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return GGML_BACKEND_DEVICE_TYPE_ACCEL; +} + +static void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, + struct ggml_backend_dev_props * props) { + props->name = ggml_backend_qnn_device_get_name(dev); + props->description = ggml_backend_qnn_device_get_description(dev); + props->type = ggml_backend_qnn_device_get_type(dev); + ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; +} + +static ggml_backend_t ggml_backend_qnn_device_init_backend(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(dev); + if (nullptr == params) { + params = 0; + } + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) params, + "/data/local/tmp/"); + + return qnn_backend; + +} + +ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { + if (device_index >= GGML_QNN_MAX_DEVICES) { + GGMLQNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_QNN_MAX_DEVICES - 1); + return nullptr; + } + + static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + /* .iface = */ { + /* .get_name = */ ggml_backend_qnn_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, + /* .get_alloc_size = */ NULL,// defaults to ggml_nbytes + /* .is_host = */ ggml_backend_qnn_buffer_is_host + }, + /* .context = */ NULL, + }; + + return &ggml_backend_buffer_type_qnn; +} + +static ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; + return ggml_backend_qnn_buffer_type(ctx->device); +} + +static ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_host_ptr(ggml_backend_dev_t dev, + void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); + + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); +} + + +static bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; + return (ggml_qnn_can_handle_op(op)); +} + +static bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(dev); + return ggml_backend_buft_is_host(buft); +} + +static struct ggml_backend_device_i ggml_backend_qnn_device_interface = { + /* .get_name = */ ggml_backend_qnn_device_get_name, + /* .get_description = */ ggml_backend_qnn_device_get_description, + /* .get_memory = */ ggml_backend_qnn_device_get_memory, + /* .get_type = */ ggml_backend_qnn_device_get_type, + /* .get_props = */ ggml_backend_qnn_device_get_props, + /* .init_backend = */ ggml_backend_qnn_device_init_backend, + /* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type, + /* .get_host_buffer_type = */ NULL, + /* 
.buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_host_ptr, + /* .supports_op = */ ggml_backend_qnn_device_supports_op, + /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, + /* .offload_op = */ NULL, + /* .event_new = */ NULL, + /* .event_free = */ NULL, + /* .event_synchronize = */ NULL, +}; + +static ggml_backend_i ggml_backend_qnn_interface = { + /* .get_name = */ ggml_backend_qnn_name, + /* .free = */ ggml_backend_qnn_free, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_update = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ ggml_backend_qnn_graph_compute, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, +}; + +//FIXME: this guid is not make sense +static ggml_guid_t ggml_backend_qnn_guid() { + static ggml_guid guid = { + 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 + }; + return &guid; +} + +bool ggml_backend_is_qnn(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +} + +void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_qnn(backend)); + + struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; + ctx->threads = n_threads; +} + +int ggml_backend_qnn_get_device_count() { + return GGML_QNN_MAX_DEVICES; +} + +struct ggml_backend_qnn_reg_context { + std::vector devices; +}; + +static const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { + return "ggml-qnn"; + + GGML_UNUSED(reg); +} + +static size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_QNN_MAX_DEVICES; +} + +static ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_UNUSED(reg); + GGML_UNUSED(index); + + GGMLQNN_LOG_DEBUG("index %d", index); + ggml_backend_qnn_reg_context * ctx = (ggml_backend_qnn_reg_context *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; +} + +static void * ggml_backend_qnn_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { + GGML_UNUSED(reg); + + if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + return (void *)ggml_backend_qnn_set_n_threads; + } + return NULL; +} + +static const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { + /* .get_name = */ ggml_backend_qnn_reg_get_name, + /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count, + /* .get_device = */ ggml_backend_qnn_reg_get_device, + /* .get_proc_address = */ ggml_backend_qnn_reg_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_qnn_reg() { + static ggml_backend_reg reg; + static bool initialized = false; + GGMLQNN_LOG_DEBUG("enter ggml_backend_qnn_reg"); + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + ggml_backend_qnn_reg_context * ctx = new ggml_backend_qnn_reg_context; + + for (int i = 0; i < ggml_backend_qnn_get_device_count(); i++) { + ggml_backend_dev_t dev = new ggml_backend_device { + /* .iface = */ ggml_backend_qnn_device_interface, + /* .reg = */ ®, + /* .context = */ &g_qnn_mgr[i] + }; + ctx->devices.push_back(dev); + } + + reg = ggml_backend_reg { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_qnn_reg_interface, + /* 
.context = */ ctx + }; + } + + initialized = true; + } + GGMLQNN_LOG_DEBUG("leave ggml_backend_qnn_reg"); + + return ® +} + +/** + * + * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU + * @param qnn_lib_path QNN binrary runtime library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @return + */ +ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { + int result = 0; + + if (nullptr == qnn_lib_path) + return nullptr; + + GGMLQNN_LOG_DEBUG("device %d", device); + GGMLQNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + if (device >= GGML_QNN_MAX_DEVICES) { + GGMLQNN_LOG_ERROR("invalid device %d", device); + return nullptr; + } + + if (nullptr != g_qnn_mgr[device].backend) { + GGMLQNN_LOG_WARN("qnn backend %d(%s) already loaded", device, ggml_backend_qnn_get_devname(device)); + return g_qnn_mgr[device].backend; + } + + std::string path = qnn_lib_path; + if (QNN_BACKEND_NPU == device) { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); + } else { + GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); + } else { + GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLQNN_LOG_INFO("%s backend setenv successfully\n", ggml_backend_qnn_get_devname(device)); + } else { + GGMLQNN_LOG_ERROR("%s backend setenv failure\n", ggml_backend_qnn_get_devname(device)); + } + } + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + GGMLQNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", ggml_backend_qnn_get_devname(device)); + delete instance; + return nullptr; + } + qnn_interface qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("qnn subsystem failure\n"); + delete instance; + return nullptr; + } + + std::string device_name = ggml_backend_qnn_get_devname(device); + GGMLQNN_LOG_INFO("qnn device name %s", device_name.c_str()); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + + ggml_backend_t qnn_backend = new ggml_backend{ + /* .guid = */ ggml_backend_qnn_guid(), + /* .iface = */ ggml_backend_qnn_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_qnn_reg(), device), + /* .context = */ &g_qnn_mgr[device] + }; + g_qnn_mgr[device].backend = qnn_backend; + + return qnn_backend; +} + +GGML_BACKEND_DL_IMPL(ggml_backend_qnn_reg) From 02e77d078beb1015a1052ed86dbade2e07eb591c Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 19 Feb 2025 21:58:55 +0800 Subject: [PATCH 076/200] ggml-qnn: a concise approach to offload mulmat to QNN backend(sync from branch kantvai-ggmlqnn-npurpc, https://github.com/kantv-ai/llama.cpp/wiki/offloading-mulmat-to-QNN-backend) --- ggml/src/ggml-qnn/ggml-qnn.cpp | 626 ++++++++++++++++++++------------- 1 file changed, 377 insertions(+), 249 
deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 6f2949333908e..a1aca7940bf4f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -13,8 +13,9 @@ * section-5 does ggml-qnn backend helper macro / data structure / function / class * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem * - * currently only provide GGML_OP_ADD's QNN backend implementation: - * - GGML_OP_ADD: this is skeleton, can expand other ggml ops according to expertise + * currently only provide OPs' QNN backend implementation of GGML_OP_ADD & GGML_OP_MUL_MAT: + * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise + * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex op accordingly * * of course, can porting ggml-qnn to Windows on ARM as need. * @@ -257,20 +258,25 @@ static void * ggmlqnn_host_malloc(size_t n) { // ================================================================================================= // section-4: QNN helper macro / data structure / function // ================================================================================================= -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - GGMLQNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ +#define VALIDATE(value, status) \ + do { \ + status = value; \ + if (status != QNN_SUCCESS) { \ + GGMLQNN_LOG_WARN("%s expected QNN_SUCCESS\n", #value); \ + return status; \ + } \ } while (0) -#define CHECK_QNN_API(error) \ - do { \ - if (QNN_SUCCESS != (error)) { \ - GGMLQNN_LOG_INFO("error = %d\n", (error)); \ - } \ +#define CHECK_QNN_API(error, result) \ + do { \ + error = (result); \ + if (QNN_SUCCESS != error) { \ + if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ + GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + } else { \ + GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, qnn_get_error_string(error)); \ + } \ + } \ } while (0) #define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) @@ -823,9 +829,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { uint32_t rank = QNN_TENSOR_GET_RANK(src); QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = rank * sizeof(uint32_t); + size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); uint32_t * dimensions = (uint32_t *)malloc(dim_size); - GGMLQNN_LOG_DEBUG("tensor dims %p", dimensions); if (dimensions == nullptr) { GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); return 1; @@ -1025,6 +1030,9 @@ using _pfn_QnnSaver_initialize = decltype(QnnSaver_init using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); +using qnn_res_t = std::tuple>; +using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; + enum class ggml_qnn_profile_level { profile_off = 0, profile_basic = 1, @@ -1122,12 +1130,9 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; struct qcom_socinfo socinfo; - - //FIXME: should I move it from public member of class qnn_instance to here? 
- //std::map> _qnn_graph_map; } ; -//FIXME: the following global vars and three helper funcs should be removed in the future +//TODO: the following global vars and three helper funcs should be removed in the future static int32_t g_ggmltensor_idx = 0; static void reset_idx() { g_ggmltensor_idx = 0; @@ -1399,11 +1404,11 @@ static const char * ggml_get_type_name(ggml_type type) { return traits->type_name; } -Qnn_Tensor_t * ggml_qnn_create_tensor(const ggml_tensor * tensor) { +static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) { Qnn_ErrorHandle_t error = QNN_SUCCESS; char tensor_name[GGML_MAX_NAME] = {0}; - //FIXME:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique + //TODO:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); inc_idx(); @@ -1450,6 +1455,73 @@ Qnn_Tensor_t * ggml_qnn_create_tensor(const ggml_tensor * tensor) { return p_qnn_tensor; } +static Qnn_Tensor_t * ggml_qnn_create_mulmat_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {0}; + + //TODO:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique + if (nullptr != name) { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); + } else { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, get_idx()); + } + GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); + inc_idx(); + + //there are different dimension order between ggml tensor and qnn tensor + uint32_t dimensions_transpose[GGML_MAX_DIMS] = {}; + uint32_t * tensor_dims = nullptr; + + if (nullptr != tensor) { + dimensions_transpose[0] = (uint32_t) tensor->ne[1]; + dimensions_transpose[1] = (uint32_t) tensor->ne[0]; + dimensions_transpose[2] = (uint32_t) tensor->ne[2]; + dimensions_transpose[3] = (uint32_t) tensor->ne[3]; + tensor_dims = dimensions_transpose; + } + if (nullptr != dims) { + tensor_dims = dims; + } + + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {QNN_DEFINITION_UNDEFINED, + QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = rank, + .dimensions = tensor_dims, + .memType = QNN_TENSORMEMTYPE_RAW, + {.clientBuf = {nullptr, 0} + } + } + } + }; + if (nullptr != name) { + QNN_VER_PTR(qnn_tensor)->name = name; + } + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + GGMLQNN_LOG_WARN("calloc failed"); + return nullptr; + } + error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + GGMLQNN_LOG_WARN("init tensor failed"); + return nullptr; + } + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; + + return p_qnn_tensor; +} + //TODO: // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { @@ -1908,7 +1980,7 @@ class qnn_instance { } public: - std::map> _qnn_graph_map; + std::map>> _qnn_graph_map; private: int load_system(); @@ -1988,7 
+2060,7 @@ class qnn_instance { std::string _graph_name; QNNBackend _device_id; - bool _enable_qnn_rpc = false; //FIXME:unknown issue with QNN RPC feature + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature DISABLE_COPY(qnn_instance); DISABLE_MOVE(qnn_instance); @@ -2207,7 +2279,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - void *lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); if (nullptr == lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; @@ -2223,7 +2295,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * // get QnnInterface Providers std::uint32_t num_providers = 0; - const QnnInterface_t **provider_list = nullptr; + const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); if (error != QNN_SUCCESS) { GGMLQNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); @@ -2282,8 +2354,9 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * QnnSaver_Config_t backendid_cfg; backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; backendid_cfg.backendId = _backend_id; - const QnnSaver_Config_t *saverCfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; - if (0 == QnnSaver_initialize(saverCfg)) { + + const QnnSaver_Config_t * saver_cfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; + if (0 == QnnSaver_initialize(saver_cfg)) { GGMLQNN_LOG_INFO("QnnSaver_initialize successfully"); } else { GGMLQNN_LOG_WARN("QnnSaver_initialize failure"); @@ -2668,7 +2741,7 @@ int qnn_instance::qnn_finalize() { Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("enter %s\n", __func__); - //FIXME:should be removed in the future + //TODO:should be removed in the future reset_idx(); free_rpcmem(); @@ -2971,6 +3044,20 @@ static void dump_tensors_info(const struct ggml_tensor * tensor) { tensor->nb[1], tensor->nb[2]); } +//TODO: currently only support offloading 2D matrix to QNN backend +static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, uint32_t * ggml_dimensions, uint32_t rank) { + if (rank > GGML_MAX_DIMS) { + GGMLQNN_LOG_WARN("invalid params"); + return; + } + if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { + GGMLQNN_LOG_WARN("invalid params"); + return; + } + qnn_dimensions[0] = ggml_dimensions[1]; + qnn_dimensions[1] = ggml_dimensions[0]; +} + // ================================================================================================= // section-6: implementation of ggml-qnn backend // ================================================================================================= @@ -3010,7 +3097,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { return false; #if GGMLQNN_PRINT_OP_ADD_LOG - dump_tensors_info(tensor); + //dump_tensors_info(tensor); #endif return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); @@ -3019,27 +3106,21 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { if (tensor->op == GGML_OP_MUL_MAT) { #if GGMLQNN_PRINT_OP_MUL_MAT_LOG - dump_tensors_info(tensor); + //dump_tensors_info(tensor); #endif - //FIXME: 2048 is an experimental value between ASR inference and LLM inference 
because - // it's better only offload big matrix to QNN backend - if (ne01 <= 2048) { + uint32_t src0_rank = ggml_get_tensor_rank(src0); + uint32_t src1_rank = ggml_get_tensor_rank(src1); + + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mulmat to QNN backend return false; - } -#if 0 - //TODO: offload mul_mat to QNN backend - //need to process type trait in func ggml_qnn_mul_mat(...): + + //TODO: support more data type in func ggml_qnn_mul_mat(...): //src0: q4_0, q6_k, ... //src1: f32 //dst : f32 - return (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_F16); -#else - //fall back to ggml cpu backend return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && (src0->type == src1->type) && (src0->type == tensor->type); -#endif } //TODO:for other op @@ -3054,65 +3135,51 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { bool graph_initialized = false; qnn_instance * instance = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - std::string graph_name = "ggml_op_qnn_add"; qnn_perf op_perf = qnn_perf("ggml_qnn_add"); Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; Qnn_Param_t qnn_params[] = {}; - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; const ggml_tensor * src0 = op->src[0]; const ggml_tensor * src1 = op->src[1]; ggml_tensor * dst = op; - uint8_t * qnn_rpcbuffer_0 = nullptr; - uint8_t * qnn_rpcbuffer_1 = nullptr; - uint8_t * qnn_rpcbuffer_2 = nullptr; - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); - std::string map_entry; - get_graph_key_from_op(op, map_entry); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { + std::string graph_name; + get_graph_key_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + qnn_tensors_t & tensor = std::get<1>(graph_item); + p_tensor0 = tensor[0]; + p_tensor1 = tensor[1]; + p_tensor2 = tensor[2]; } else { - tensor_0 = ggml_qnn_create_tensor(src0); - tensor_1 = ggml_qnn_create_tensor(src1); - tensor_2 = ggml_qnn_create_tensor(dst); + p_tensor0 = ggml_qnn_create_compute_tensor(src0); + p_tensor1 = ggml_qnn_create_compute_tensor(src1); + p_tensor2 = ggml_qnn_create_compute_tensor(dst); } - print_tensors_info(__func__, ctx, src0, src1, dst); - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = 
qnn_datatype_from_ggml_datatype(dst->type); + //ensure QNN tensor has correct tensor type + QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + //save the original dimensions of qnn tensors + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; if (!graph_initialized) { - graph_name = map_entry; GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); if (ctx->device == QNN_BACKEND_NPU) { error = create_htp_graph(ctx, graph_name, &graph_handle); @@ -3127,44 +3194,44 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { } if (enable_npu_rpc) { - QNN_VER_PTR(*tensor_0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_0)->clientBuf = {.data=nullptr, .dataSize=0}; + QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor0)->clientBuf = {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_1)->clientBuf = {.data=nullptr, .dataSize=0}; + QNN_VER_PTR(*p_tensor1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor1)->clientBuf = {.data=nullptr, .dataSize=0}; - QNN_VER_PTR(*tensor_2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*tensor_2)->clientBuf = {.data=nullptr, .dataSize=0}; + QNN_VER_PTR(*p_tensor2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor2)->clientBuf = {.data=nullptr, .dataSize=0}; } - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0)); - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1)); - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); if (enable_npu_rpc) { - qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, tensor_0, true); - qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, tensor_1, true); - qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, tensor_2, false); + uint8_t * qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, p_tensor0, true); + uint8_t * qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, p_tensor1, true); + uint8_t * qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, p_tensor2, false); if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { GGMLQNN_LOG_INFO("create rpc buffer failure\n"); - //FIXME: potential memory leak althought it shouldn't happen + //TODO: potential memory leak although it shouldn't happen return; } } else { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + 
QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; } Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 + *p_tensor0, + *p_tensor1 }; Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 + *p_tensor2 }; Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { + QNN_OPCONFIG_VERSION_1, .v1 = { "ggml_op_add", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_ELEMENT_WISE_ADD, @@ -3176,26 +3243,38 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { tensor_outputs } }; - CHECK_QNN_API(error = qnn_raw_interface.graphAddNode(graph_handle, op_config)); - CHECK_QNN_API(error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - error = qnn_raw_interface.graphExecute(graph_handle, + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, - nullptr, nullptr); - CHECK_QNN_API(error); + nullptr, nullptr)); if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); if (nullptr != qnn_rpcbuffer) { memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); } } - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); - instance->_qnn_graph_map[map_entry] = graph_item; + qnn_tensors_t ggml_op_add_tensors; + ggml_op_add_tensors.reserve(3); + ggml_op_add_tensors.push_back(p_tensor0); + ggml_op_add_tensors.push_back(p_tensor1); + ggml_op_add_tensors.push_back(p_tensor2); + + auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; } else { + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; @@ -3204,76 +3283,76 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; + QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*p_tensor0)->rank = ggml_get_tensor_rank(src0); + QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; + QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*p_tensor1)->rank = ggml_get_tensor_rank(src1); + QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; - 
QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; + QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; + QNN_VER_PTR(*p_tensor2)->rank = ggml_get_tensor_rank(dst); + QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type; if (enable_npu_rpc) { - //FIXME:why failure with test-backend-ops - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_rpcbuffer_0); + //TODO: NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); if (nullptr != qnn_buffer_0) { memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); } - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_rpcbuffer_1); + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); if (nullptr != qnn_buffer_1) { memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); } } else { - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; } Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 + *p_tensor0, + *p_tensor1 }; Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 + *p_tensor2 }; - error = qnn_raw_interface.graphExecute(graph_handle, + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, - nullptr, nullptr); - CHECK_QNN_API(error); + nullptr, nullptr)); if (enable_npu_rpc) { - //FIXME:why failure with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*tensor_2)->memHandle)); + //TODO:NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); if (nullptr != qnn_buffer_2) { memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); } } } - //avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; #if GGMLQNN_PRINT_OP_ADD_LOG op_perf.info(); #endif } -//TODO: /* - * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add,but type trait and matrix transpose are required - * for offload mulmat to QNN backend, so it's a standalone function. 
+ * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add but much more complicated than ggml_qnn_add, + * matrix transpose and type trait are required for offload mulmat to QNN backend, + * so it's a standalone function. accordingly, this is another typical skeleton for offload other + * ggml ops to QNN backend * * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, we should focus on MUL_MAT. * - * we have three kinds of MUL_MAT to compute: + * have three kinds of MUL_MAT to compute: * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1 @@ -3284,148 +3363,200 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); qnn_instance * instance = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - - std::string graph_name = "ggml_op_qnn_mul_mat"; Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * tensor_0 = nullptr; - Qnn_Tensor_t * tensor_1 = nullptr; - Qnn_Tensor_t * tensor_2 = nullptr; - - Qnn_Param_t qnn_params[] = {}; - - enum ggml_op ggmlop = GGML_OP_ADD; - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Tensor_t * p_param_tensor = nullptr; + Qnn_Tensor_t * p_tensor2_transpose = nullptr; const ggml_tensor * src0 = op->src[0]; const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; + ggml_tensor * dst = op; GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); - std::string map_entry; - get_graph_key_from_op(op, map_entry); - if (instance->_qnn_graph_map.find(map_entry) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - auto & graph_item = instance->_qnn_graph_map[map_entry]; - graph_handle = std::get<0>(graph_item); - tensor_0 = std::get<1>(graph_item); - tensor_1 = std::get<2>(graph_item); - tensor_2 = std::get<3>(graph_item); + std::string graph_name; + get_graph_key_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_tensor1 = tensors[1]; + p_tensor2 = tensors[2]; + p_param_tensor = tensors[3]; + p_tensor2_transpose = tensors[4]; } else { - tensor_0 = ggml_qnn_create_tensor(src0); - tensor_1 = ggml_qnn_create_tensor(src1); - tensor_2 = ggml_qnn_create_tensor(dst); + p_tensor0 = ggml_qnn_create_mulmat_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor1 = ggml_qnn_create_mulmat_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor2 = ggml_qnn_create_mulmat_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); } print_tensors_info(__func__, ctx, src0, src1, dst); - QNN_VER_PTR(*tensor_0)->type = QNN_TENSOR_TYPE_APP_WRITE; - 
QNN_VER_PTR(*tensor_1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*tensor_2)->type = QNN_TENSOR_TYPE_APP_READ; - - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); + //ensure QNN tensor has correct tensor type + QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*tensor_0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*tensor_1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*tensor_2)->dimensions; + //save the original dimensions of qnn tensors + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; if (!graph_initialized) { - graph_name = map_entry; GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + /* + there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor + + 2. QNN's MatMul can only support input tensors with rank >= 2 + + there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend. 
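+
+           a concrete, purely illustrative sketch of that mapping (the names dst_trans and perm
+           below are only used for explanation, they are not identifiers from the QNN SDK):
+               src0->ne = {K, M, 1, 1}          ggml order: K = columns, M = rows
+               src1->ne = {K, N, 1, 1}
+               dst->ne  = {M, N, 1, 1}
+           after reversing the dimension order between ggml tensors and QNN tensors, the graph
+           composed below computes roughly
+               dst_trans = MatMul(src0, src1, transpose_in1 = true)    with QNN dims {M, N}
+               dst       = Transpose(dst_trans, perm = {1, 0})         with QNN dims {N, M}
+           which is equivalent to ggml's dst = src1 * src0^T for the 2D case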
+ */ + + //step-1: create qnn graph error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); if (QNN_SUCCESS != error) { GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_0)); - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_1)); - CHECK_QNN_API(error = qnn_raw_interface.tensorCreateGraphTensor(graph_handle, tensor_2)); - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + //step-2: create param tensor for mulmat of 2d matrix + uint32_t param_tensor_dims[] = {2}; + uint32_t param_tensor_data[2] = {1, 0}; + p_param_tensor = ggml_qnn_create_mulmat_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1, param_tensor_dims, param_tensor_data, 8); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); + + //step-3: create compute tensor from ggml tensor + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + + //step-4: create a transpose tensor + uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; + p_tensor2_transpose = ggml_qnn_create_mulmat_tensor(dst,"transpose",QNN_TENSOR_TYPE_NATIVE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions,ggml_get_tensor_rank(dst)); + //save pointer because the dimensions of tensor p_tensor2_transpose will be changed later + uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions; + //update dimensions of tensor p_tensor2_transpose to make QNN SDK happy + QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_transpose_dims; + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); + + //step-5: compose qnn graph: add mat_mul node + Qnn_Param_t out_0_params[] = { + {QNN_PARAMTYPE_SCALAR, + QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, + .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} + } + }; - Qnn_Tensor_t tensor_inputs[] = { - *tensor_0, - *tensor_1 + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; + Qnn_OpConfig_t out_0 = { + QNN_OPCONFIG_VERSION_1, .v1 = + {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + 1, + out_0_params, + 2, + out_0_inputs, + 1, + out_0_outputs} }; - Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); + + //step-5: compose qnn graph: add transpose node + Qnn_Param_t out_trans1_0_params[] = { + {(Qnn_ParamType_t) 1, + "perm", .tensorParam = *p_param_tensor + } }; - Qnn_OpConfig_t op_config = { - (Qnn_OpConfigVersion_t) 1, .v1 = { - 
"ggml_op_mul_mat", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, - 0, - qnn_params, - 2, - tensor_inputs, + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; + Qnn_OpConfig_t out_trans1_0 = { + QNN_OPCONFIG_VERSION_1, + .v1 = {"ggmlqnn_mulmat_transpose_opconfig", + "qti.aisw", + QNN_OP_TRANSPOSE, 1, + out_trans1_0_params, 1, - tensor_outputs - } + out_trans1_0_inputs, + 1, + out_trans1_0_outputs} }; - CHECK_QNN_API(error = qnn_raw_interface.graphAddNode(graph_handle, op_config)); - CHECK_QNN_API(error = qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - error = qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr); - CHECK_QNN_API(error); - auto graph_item = std::make_tuple(graph_handle, tensor_0, tensor_1, tensor_2); - instance->_qnn_graph_map[map_entry] = graph_item; + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); + + //step-6: finalize qnn graph and execute qnn graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + input_tensors_0, 2, + output_tensors_0, 1, + NULL, NULL)); + + qnn_tensors_t ggml_op_mulmat_tensors; + ggml_op_mulmat_tensors.reserve(5); + ggml_op_mulmat_tensors.push_back(p_tensor0); + ggml_op_mulmat_tensors.push_back(p_tensor1); + ggml_op_mulmat_tensors.push_back(p_tensor2); + ggml_op_mulmat_tensors.push_back(p_param_tensor); + ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); + + auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + + //avoid cleanup these resource to make test_backend_ops happy + //free_qnn_tensor(p_param_tensor); + //restore pointer to avoid memory leak + QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose; + //free_qnn_tensor(p_tensor2_transpose); } else { - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - QNN_VER_PTR(*tensor_0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*tensor_0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*tensor_0)->dataType = src0_qnn_type; - - QNN_VER_PTR(*tensor_1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*tensor_1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*tensor_1)->dataType = src1_qnn_type; - - QNN_VER_PTR(*tensor_2)->dimensions = dimensions_output; - QNN_VER_PTR(*tensor_2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*tensor_2)->dataType = dst_qnn_type; - - QNN_VER_PTR(*tensor_0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*tensor_1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*tensor_2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; Qnn_Tensor_t 
tensor_inputs[] = { - *tensor_0, - *tensor_1 + *p_tensor0, + *p_tensor1 }; Qnn_Tensor_t tensor_outputs[] = { - *tensor_2 + *p_tensor2 }; - error = qnn_raw_interface.graphExecute(graph_handle, + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, - nullptr, nullptr); - CHECK_QNN_API(error); + nullptr, nullptr)); } - //avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*tensor_0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*tensor_1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*tensor_2)->dimensions = tensor_2_dimensions; + // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; op_perf.info(); } @@ -3608,21 +3739,18 @@ static void ggml_backend_qnn_free(ggml_backend_t backend) { qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - std::map>::iterator graph_it; + std::map>>::iterator graph_it; for (graph_it = instance->_qnn_graph_map.begin(); graph_it != instance->_qnn_graph_map.end(); graph_it++) { auto & graph_item = graph_it->second; Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); - Qnn_Tensor_t * tensor_0 = std::get<1>(graph_item); - Qnn_Tensor_t * tensor_1 = std::get<2>(graph_item); - Qnn_Tensor_t * tensor_2 = std::get<3>(graph_item); + qnn_tensors_t & tensors = std::get<1>(graph_item); + for (auto tensor_it = tensors.begin(); tensor_it != tensors.end(); ++tensor_it) { + free_qnn_tensor(*tensor_it); + } GGML_UNUSED(graph_handle); GGMLQNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); - free_qnn_tensor(tensor_0); - free_qnn_tensor(tensor_1); - free_qnn_tensor(tensor_2); } instance->_qnn_graph_map.clear(); From 0d6dffcb03af3e6659dc155f243d861dc0d05ef2 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 20 Feb 2025 08:39:15 +0800 Subject: [PATCH 077/200] ggml-qnn: remove redundant codes --- ggml/src/ggml-qnn/ggml-qnn.cpp | 298 +++++++++++---------------------- 1 file changed, 97 insertions(+), 201 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index a1aca7940bf4f..37c947f412f1f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1404,58 +1404,69 @@ static const char * ggml_get_type_name(ggml_type type) { return traits->type_name; } -static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - char tensor_name[GGML_MAX_NAME] = {0}; - - //TODO:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); - GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); - inc_idx(); - - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], - (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; +static const char * get_ggml_type_name(ggml_type type) { + const auto * traits = ggml_get_type_traits(type); + return traits->type_name; +} - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - } - Qnn_Tensor_t qnn_tensor = { - .version= QNN_TENSOR_VERSION_1, - {.v1= 
{ - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = ggml_get_tensor_rank(tensor), - .dimensions = dimensions, - .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {.data = nullptr, - .dataSize = 0}}}} - }; - Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (nullptr == p_qnn_tensor) { - GGMLQNN_LOG_WARN("calloc failed"); - return nullptr; +//TODO: +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; + default: + break; } - error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); - if (error != QNN_SUCCESS) { - free(p_qnn_tensor); - GGMLQNN_LOG_WARN("init tensor failed"); - return nullptr; + return QNN_DATATYPE_UNDEFINED; +} + +//TODO: +static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { + switch (qnn_type) { + case QNN_DATATYPE_FLOAT_32: + return GGML_TYPE_F32; + case QNN_DATATYPE_FLOAT_16: + return GGML_TYPE_F16; + case QNN_DATATYPE_UINT_32: + case QNN_DATATYPE_INT_32: + return GGML_TYPE_I32; + case QNN_DATATYPE_INT_16: + return GGML_TYPE_I16; + case QNN_DATATYPE_INT_8: + return GGML_TYPE_I8; + case QNN_DATATYPE_SFIXED_POINT_8: + return GGML_TYPE_Q8_0; + case QNN_DATATYPE_SFIXED_POINT_4: + return GGML_TYPE_Q4_0; + default: + break; } + return GGML_TYPE_COUNT; +} - return p_qnn_tensor; +//TODO: add more ops +static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { + switch (ggmlop) { + case GGML_OP_ADD: + return QNN_OP_ELEMENT_WISE_ADD; + case GGML_OP_MUL_MAT: + return QNN_OP_MAT_MUL; + default: + break; + } + return nullptr; } -static Qnn_Tensor_t * ggml_qnn_create_mulmat_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, +static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size) { Qnn_ErrorHandle_t error = QNN_SUCCESS; char tensor_name[GGML_MAX_NAME] = {0}; @@ -1480,6 +1491,7 @@ static Qnn_Tensor_t * ggml_qnn_create_mulmat_tensor(const ggml_tensor * tensor, dimensions_transpose[3] = (uint32_t) tensor->ne[3]; tensor_dims = dimensions_transpose; } + //re-assign tensor_dims if (nullptr != dims) { tensor_dims = dims; } @@ -1522,66 +1534,25 @@ static Qnn_Tensor_t * ggml_qnn_create_mulmat_tensor(const ggml_tensor * tensor, return p_qnn_tensor; } -//TODO: -// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return 
QNN_DATATYPE_UNDEFINED; -} +static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) { + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; -//TODO: -static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { - switch (qnn_type) { - case QNN_DATATYPE_FLOAT_32: - return GGML_TYPE_F32; - case QNN_DATATYPE_FLOAT_16: - return GGML_TYPE_F16; - case QNN_DATATYPE_UINT_32: - case QNN_DATATYPE_INT_32: - return GGML_TYPE_I32; - case QNN_DATATYPE_INT_16: - return GGML_TYPE_I16; - case QNN_DATATYPE_INT_8: - return GGML_TYPE_I8; - case QNN_DATATYPE_SFIXED_POINT_8: - return GGML_TYPE_Q8_0; - case QNN_DATATYPE_SFIXED_POINT_4: - return GGML_TYPE_Q4_0; - default: - break; + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - return GGML_TYPE_COUNT; -} -//TODO: add more ops -static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL_MAT: - return QNN_OP_MAT_MUL; - default: - break; - } - return nullptr; -} + qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); + Qnn_Tensor_t * p_qnn_tensor = ggml_qnn_create_general_tensor(tensor, nullptr, + qnn_tensor_type, qnn_data_type, + ggml_n_dims(tensor), dimensions, + nullptr, 0); -static const char * get_ggml_type_name(ggml_type type) { - const auto * traits = ggml_get_type_traits(type); - return traits->type_name; + return p_qnn_tensor; } static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { @@ -1865,7 +1836,7 @@ class qnn_instance { uint8_t do_node_validation = 1, const QnnGraph_Config_t ** graph_configs = nullptr ); - int init_qnn_graph(const std::string &graph_name, QNNBackend device, size_t vtcm_size_in_mb); + int init_qnn_graph(const std::string &graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); int finalize_qnn_graph(); @@ -2813,7 +2784,7 @@ int qnn_instance::qnn_finalize() { return ret_status; } -int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb) { +int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb, size_t hvx_threads) { _graph_name = graph_name; _device_id = device; @@ -2824,7 +2795,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi if (device == QNN_BACKEND_NPU) { QnnHtpGraph_CustomConfig_t hvx_config; hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 8; + hvx_config.numHvxThreads = hvx_threads; QnnGraph_Config_t graph_hvx_config; graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_hvx_config.customConfig = &hvx_config; @@ -2940,65 +2911,11 @@ static uint8_t * create_rpc_buffer(qnn_instance * instance, const ggml_tensor * return qnn_rpcbuffer; } -static Qnn_ErrorHandle_t create_htp_graph(ggml_backend_qnn_context * ctx, const std::string & graph_name, Qnn_GraphHandle_t * graph_handle) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (nullptr == ctx) - return QNN_MIN_ERROR_COMMON; - - qnn_instance * instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - 
QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = 4; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; - - QnnHtpGraph_CustomConfig_t dlbc_config; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; - - QnnHtpGraph_CustomConfig_t opt_config; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 3; // 1 or 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; - - QnnHtpGraph_CustomConfig_t vtcm_config; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = ctx->socinfo.vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; - - QnnHtpGraph_CustomConfig_t precision_config; - precision_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; - precision_config.precision = QNN_PRECISION_FLOAT16; - QnnGraph_Config_t graph_precision_config; - graph_precision_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_precision_config.customConfig = &precision_config; - - const QnnGraph_Config_t * p_graphconfig[] = {&graph_hvx_config, - &graph_dlbc_config, - &graph_vtcm_config, - &graph_opt_config, - &graph_precision_config, - NULL}; - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), - p_graphconfig, graph_handle); - return error; -} - static void print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { //skip sanity check of params - GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + if (nullptr != func_name && nullptr != ctx) { + GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + } GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], @@ -3019,29 +2936,14 @@ static void print_tensors_info(const char * func_name, ggml_backend_qnn_context static void dump_tensors_info(const struct ggml_tensor * tensor) { //skip sanity check of params - struct ggml_tensor * src0 = tensor->src[0]; + const struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; + struct ggml_tensor * dst = const_cast(tensor); GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); - GGMLQNN_LOG_DEBUG( - "src0 %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - GGMLQNN_LOG_DEBUG( - "src1 %15s: 
type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - GGMLQNN_LOG_DEBUG( - " %15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - tensor->name, - tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], - tensor->ne[2], - tensor->nb[0], - tensor->nb[1], tensor->nb[2]); + print_tensors_info(nullptr, nullptr, src0, src1, dst); } //TODO: currently only support offloading 2D matrix to QNN backend @@ -3089,25 +2991,20 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { int64_t ne1 = tensor->ne[1]; if (tensor->op == GGML_OP_ADD) { + //dump_tensors_info(tensor); if (!ggml_are_same_shape(src0, src1)) { return false; } if (ne00 < 32) return false; - -#if GGMLQNN_PRINT_OP_ADD_LOG - //dump_tensors_info(tensor); -#endif + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); - } if (tensor->op == GGML_OP_MUL_MAT) { -#if GGMLQNN_PRINT_OP_MUL_MAT_LOG - //dump_tensors_info(tensor); -#endif + dump_tensors_info(tensor); uint32_t src0_rank = ggml_get_tensor_rank(src0); uint32_t src1_rank = ggml_get_tensor_rank(src1); @@ -3181,17 +3078,12 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { if (!graph_initialized) { GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - if (ctx->device == QNN_BACKEND_NPU) { - error = create_htp_graph(ctx, graph_name, &graph_handle); - } else { - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), - nullptr, &graph_handle); - } + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); if (QNN_SUCCESS != error) { GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } + graph_handle = instance->get_qnn_graph_handle(); if (enable_npu_rpc) { QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; @@ -3391,9 +3283,9 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { p_param_tensor = tensors[3]; p_tensor2_transpose = tensors[4]; } else { - p_tensor0 = ggml_qnn_create_mulmat_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - p_tensor1 = ggml_qnn_create_mulmat_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - p_tensor2 = ggml_qnn_create_mulmat_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); } print_tensors_info(__func__, ctx, src0, src1, dst); @@ -3443,7 +3335,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-2: create param tensor for mulmat of 2d matrix uint32_t param_tensor_dims[] = {2}; uint32_t param_tensor_data[2] = {1, 0}; - p_param_tensor = ggml_qnn_create_mulmat_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1, param_tensor_dims, param_tensor_data, 8); + p_param_tensor 
= ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1, param_tensor_dims, param_tensor_data, 8); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); //step-3: create compute tensor from ggml tensor @@ -3457,8 +3349,8 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-4: create a transpose tensor uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; - p_tensor2_transpose = ggml_qnn_create_mulmat_tensor(dst,"transpose",QNN_TENSOR_TYPE_NATIVE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions,ggml_get_tensor_rank(dst)); + p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank(dst)); //save pointer because the dimensions of tensor p_tensor2_transpose will be changed later uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions; //update dimensions of tensor p_tensor2_transpose to make QNN SDK happy @@ -3547,6 +3439,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_Tensor_t tensor_outputs[] = { *p_tensor2 }; + //attention: + // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through + // QNN SDK, details could be found at + // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, tensor_inputs, 2, tensor_outputs, 1, From 73bd6eb0cbebb521c0c74487cfd8c8c048011bfb Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 20 Feb 2025 12:33:38 +0800 Subject: [PATCH 078/200] ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc --- ggml/src/ggml-qnn/ggml-qnn.cpp | 282 ++++++++++++++++++++++----------- 1 file changed, 186 insertions(+), 96 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 37c947f412f1f..ee273503b9e8a 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -13,9 +13,10 @@ * section-5 does ggml-qnn backend helper macro / data structure / function / class * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem * - * currently only provide OPs' QNN backend implementation of GGML_OP_ADD & GGML_OP_MUL_MAT: + * currently provide following ggml ops' QNN backend implementation: * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise - * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex op accordingly + * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise + * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly * * of course, can porting ggml-qnn to Windows on ARM as need. 
* @@ -95,7 +96,6 @@ #include "ggml-qnn.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" - // ================================================================================================= // section-1: forward/external declaration // ================================================================================================= @@ -110,9 +110,9 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const // ================================================================================================= #define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend #define GGML_QNN_LOGBUF_LEN 4096 -#define ENABLE_QNNBACKEND_PERF 1 // enable/disable op's perf info +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 1 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU #define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 #define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -1187,25 +1187,28 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, }; - struct qnn_op_caps_t { const char * qnn_op_name = nullptr; const size_t input_param_count = 0; const char * qnn_param_name = nullptr; }; -static const qnn_op_caps_t kOpCaps[] = { +static const qnn_op_caps_t k_op_caps[] = { {}, // GGML_OP_NONE {}, // GGML_OP_DUP { // GGML_OP_ADD - QNN_OP_ELEMENT_WISE_ADD, // qnn_op_name - 2, // input_param_count + QNN_OP_ELEMENT_WISE_ADD, + 2, }, {}, // GGML_OP_ADD1 {}, // GGML_OP_ACC {}, // GGML_OP_SUB - {}, // GGML_OP_MUL + { + // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_MULTIPLY, + 2, + }, {}, // GGML_OP_DIV {}, // GGML_OP_SQR {}, // GGML_OP_SQRT @@ -1227,8 +1230,8 @@ static const qnn_op_caps_t kOpCaps[] = { {}, // GGML_OP_GROUP_NORM { // GGML_OP_MUL_MAT - QNN_OP_MAT_MUL, // qnn_op_name - 2, // input_param_count + QNN_OP_MAT_MUL, + 2, }, {}, // GGML_OP_MUL_MAT_ID {}, // GGML_OP_OUT_PROD @@ -1580,11 +1583,9 @@ static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & o output.append(buffer, len); } -constexpr const size_t kGgmlUnaryOpStart = GGML_OP_COUNT; - static size_t get_qnn_op_index(const ggml_tensor * tensor) { if (tensor->op == GGML_OP_UNARY) { - return kGgmlUnaryOpStart + ggml_get_unary_op(tensor); + return GGML_OP_COUNT + ggml_get_unary_op(tensor); } return tensor->op; @@ -1592,8 +1593,8 @@ static size_t get_qnn_op_index(const ggml_tensor * tensor) { static size_t get_qnn_op_input_param_count(const ggml_tensor * op) { auto op_index = get_qnn_op_index(op); - GGML_ASSERT(op_index < std::size(kOpCaps)); - return kOpCaps[op_index].input_param_count; + GGML_ASSERT(op_index < std::size(k_op_caps)); + return k_op_caps[op_index].input_param_count; } static void get_graph_key_from_op(const ggml_tensor * op, std::string & output) { @@ -1796,21 +1797,21 @@ class qnn_instance { int qnn_finalize(); - const qnn_interface &get_qnn_interface() { + const qnn_interface & get_qnn_interface() { if (!_qnn_interface.is_loaded()) { GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return _qnn_interface; } - const QNN_INTERFACE_VER_TYPE &get_qnn_raw_interface() { + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { if (!_qnn_interface.is_loaded()) { GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } return 
_qnn_raw_interface; } - const QNN_SYSTEM_INTERFACE_VER_TYPE &get_qnn_raw_system_interface() { + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { if (!_qnn_interface.is_loaded()) { GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } @@ -1836,7 +1837,7 @@ class qnn_instance { uint8_t do_node_validation = 1, const QnnGraph_Config_t ** graph_configs = nullptr ); - int init_qnn_graph(const std::string &graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); + int init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); int finalize_qnn_graph(); @@ -1850,8 +1851,8 @@ class qnn_instance { return 1; } - QnnHtpDevice_Infrastructure_t *htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t *htp_perfinfra = &htp_infra->perfInfra; + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; uint32_t power_configid = 1; uint32_t device_id = 0; uint32_t core_id = 0; @@ -1925,6 +1926,7 @@ class qnn_instance { } size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + size_t get_rpcmem_usage() { return _rpcmem_usage; } int32_t rpcmem_to_fd(void * buf); @@ -1950,6 +1952,32 @@ class qnn_instance { return _enable_qnn_rpc; } + void probe_device_meminfo() { + size_t candidate_size = 0; + uint8_t *rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], + strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + + free_rpcmem(); + _rpcmem_usage = 0; + GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + } + public: std::map>> _qnn_graph_map; @@ -1969,6 +1997,8 @@ class qnn_instance { void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { _qnn_raw_system_interface = raw_interface; } + + void * alloc_rpcmem_internal(size_t bytes, size_t alignment); private: static constexpr const int _required_num_providers = 1; @@ -1987,7 +2017,7 @@ class qnn_instance { qnn_interface _qnn_interface; - void *_system_lib_handle = nullptr; + void * _system_lib_handle = nullptr; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -2013,7 +2043,6 @@ class qnn_instance { std::unordered_map _qnn_mem_set; std::unordered_map _qnn_rpc_buffer_to_handles; - static std::mutex _init_mutex; static std::unordered_map _loaded_lib_handle; static std::unordered_map _lib_path_to_backend_id; @@ -2027,7 +2056,9 @@ class qnn_instance { pfn_rpc_mem_init _pfn_rpc_mem_init; pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; - size_t _rpcmem_capacity = 512; + std::unordered_map _rpcmem_usage_map; + size_t _rpcmem_capacity = 512; // mempool size in Mbytes + size_t _rpcmem_usage = 0; // mempool usage in MBytes std::string _graph_name; QNNBackend _device_id; @@ -2042,7 +2073,7 @@ std::unordered_map qnn_instance::_loaded_li std::unordered_map qnn_instance::_lib_path_to_backend_id; std::unordered_map qnn_instance::_loaded_backend; -void * 
qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { +void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { GGMLQNN_LOG_WARN("rpc memory not initialized\n"); return nullptr; @@ -2062,17 +2093,50 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); _pfn_rpc_mem_free(buf); } + return aligned_buf; +} + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (_rpcmem_usage > (_rpcmem_capacity - 8)) { // reserve 8Mbytes in rpc mempool + GGMLQNN_LOG_WARN("rpc mempool capcaity: %d MB, usage: %d MB", _rpcmem_capacity, _rpcmem_usage); + return nullptr; + } + + auto aligned_buf = alloc_rpcmem_internal(bytes, alignment); + if (nullptr == aligned_buf) + return nullptr; + _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes)); + size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); + rpcmem_usage_in_bytes += bytes; + _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); return aligned_buf; } void qnn_instance::free_rpcmem(void * buf) { + size_t rpcbuffer_size = 0; if (!_rpcmem_initialized) { GGMLQNN_LOG_WARN("rpc memory not initialized\n"); } else if (0 == _rpcmem_store_map.count(buf)) { GGMLQNN_LOG_WARN("no allocated tensor\n"); } else { GGMLQNN_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]); + for (std::unordered_map::iterator it = _rpcmem_usage_map.begin(); + it != _rpcmem_usage_map.end(); + it++) { + void * rpcbuffer = it->first; + if (buf == rpcbuffer) { + rpcbuffer_size = it->second; + size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); + rpcmem_usage_in_bytes -= rpcbuffer_size; + _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); + } + } + if (rpcbuffer_size != 0) { + _rpcmem_usage_map.erase(buf); + } else { + GGMLQNN_LOG_WARN("it shouldn't happen, pls check why?"); + } _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); } @@ -2094,6 +2158,8 @@ void qnn_instance::free_rpcmem() { _pfn_rpc_mem_free(rpcbuffer); } _rpcmem_store_map.clear(); + _rpcmem_usage_map.clear(); + _rpcmem_usage = 0; } int32_t qnn_instance::rpcmem_to_fd(void * buf) { @@ -2177,7 +2243,11 @@ Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t ran } GGMLQNN_LOG_DEBUG("mem_fd %d", mem_fd); - Qnn_MemDescriptor_t descriptor = {{rank, dimensions, nullptr}, data_type, QNN_MEM_TYPE_ION, {{mem_fd}}}; + Qnn_MemDescriptor_t descriptor = { + {rank, dimensions, nullptr}, + data_type, QNN_MEM_TYPE_ION, + {{mem_fd}} + }; Qnn_MemHandle_t handle = nullptr; auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); @@ -2318,7 +2388,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; -#if 0 // keep them here for further use +#if 0 // leave them here for further use QnnSaver_Config_t outputdir_cfg; outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; outputdir_cfg.outputDirectory = "/data/local/tmp/"; @@ -2468,6 +2538,7 @@ int qnn_instance::unload_system() { return result; } +#if GGMLQNN_PRINT_QNN_INTERNAL_LOG static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, @@ -2499,24 +2570,25 @@ static void ggml_qnn_logcallback(const char * fmt, } double ms = (double) timestamp / 1000000.0; - { std::lock_guard lock(log_mutex); - memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, 
fmt, argp); -#if GGMLQNN_PRINT_QNN_INTERNAL_LOG GGMLQNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); -#endif } } +#else +static void ggml_qnn_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { +} +#endif int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; GGMLQNN_LOG_DEBUG("enter qni_init\n"); - const std::lock_guard lock(_init_mutex); - if (0 != load_system()) { GGMLQNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); return 1; @@ -2542,9 +2614,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _loaded_lib_handle.count(backend_id)); return 3; } - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); - #if 1 _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); #else @@ -2671,25 +2741,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem(probe_slots[idx] * SIZE_IN_MB, 4)); - if (nullptr == rpc_buffer) { - GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + probe_device_meminfo(); if (0 != init_htp_perfinfra()) { GGMLQNN_LOG_WARN("initialize HTP performance failure"); @@ -2963,6 +3015,7 @@ static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, u // ================================================================================================= // section-6: implementation of ggml-qnn backend // ================================================================================================= +//TODO: refine this function as it is a performance hotspot/bottleneck function static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { if (tensor->op == GGML_OP_NONE) { return true; @@ -2973,7 +3026,9 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { return false; } - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT)); + //TODO: support other op + bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT) + || (tensor->op == GGML_OP_MUL)); if (!supported_op) { return false; } @@ -2981,37 +3036,34 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; - int64_t ne00 = tensor->src[0]->ne[0]; - int64_t ne01 = tensor->src[0]->ne[1]; + const int64_t ne00 = tensor->src[0]->ne[0]; + const int64_t ne01 = tensor->src[0]->ne[1]; - int64_t ne10 = tensor->src[1]->ne[0]; - int64_t ne11 = tensor->src[1]->ne[1]; + const int64_t ne10 = tensor->src[1]->ne[0]; + const int64_t ne11 = tensor->src[1]->ne[1]; - int64_t ne0 = tensor->ne[0]; - int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; + + const uint32_t src0_rank = 
ggml_get_tensor_rank(src0); + const uint32_t src1_rank = ggml_get_tensor_rank(src1); if (tensor->op == GGML_OP_ADD) { //dump_tensors_info(tensor); if (!ggml_are_same_shape(src0, src1)) { return false; } - if (ne00 < 32) return false; - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); } if (tensor->op == GGML_OP_MUL_MAT) { - dump_tensors_info(tensor); - uint32_t src0_rank = ggml_get_tensor_rank(src0); - uint32_t src1_rank = ggml_get_tensor_rank(src1); - + //dump_tensors_info(tensor); if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mulmat to QNN backend return false; - - //TODO: support more data type in func ggml_qnn_mul_mat(...): + //TODO: support more data type in func ggml_qnn_mul_mat(...) //src0: q4_0, q6_k, ... //src1: f32 //dst : f32 @@ -3020,19 +3072,30 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { && (src0->type == src1->type) && (src0->type == tensor->type); } - //TODO:for other op + if (tensor->op == GGML_OP_MUL) { + dump_tensors_info(tensor); + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mul to QNN backend + return false; + return (src0->type == GGML_TYPE_F32) + && (src1->type == GGML_TYPE_F32) + && (tensor->type == src1->type); + } + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) && (src0->type == src1->type) && (src0->type == tensor->type); } -static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { +/* + * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input + * tensor and 1 output tensor +*/ +static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; enum ggml_status result = GGML_STATUS_SUCCESS; bool graph_initialized = false; qnn_instance * instance = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - qnn_perf op_perf = qnn_perf("ggml_qnn_add"); Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * p_tensor0 = nullptr; Qnn_Tensor_t * p_tensor1 = nullptr; @@ -3045,6 +3108,14 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + size_t qnn_op_index = get_qnn_op_index(op); + GGML_ASSERT(qnn_op_index < std::size(k_op_caps)); + const char * qnn_op_name = k_op_caps[qnn_op_index].qnn_op_name; + std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); + const char * ggml_op_name = ggml_op_name_string.c_str(); + + qnn_perf op_perf = qnn_perf(ggml_op_name); op_perf.start(); std::string graph_name; @@ -3124,9 +3195,9 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { }; Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, .v1 = { - "ggml_op_add", + ggml_op_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_ELEMENT_WISE_ADD, + qnn_op_name, 0, qnn_params, 2, @@ -3138,9 +3209,9 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); + tensor_inputs, 2, + tensor_outputs, 1, + 
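// The lookup just above goes through get_qnn_op_index() and k_op_caps, whose definitions are
// not part of this hunk. The following is only an assumed shape for that table, written for
// illustration -- not the actual definition in ggml-qnn.cpp: one capability entry per ggml op,
// indexed by the ggml op enum, carrying the QNN operator name used when building the op config
// (e.g. QNN_OP_ELEMENT_WISE_ADD for GGML_OP_ADD).
#include "ggml.h"
#include <cstddef>

struct qnn_op_caps_sketch {
    bool         supported;    // can this ggml op be offloaded at all
    const char * qnn_op_name;  // QNN operator name, nullptr when unsupported
};

// zero-initialized here; the real table fills in entries such as GGML_OP_ADD, GGML_OP_MUL
// and GGML_OP_MUL_MAT
static const qnn_op_caps_sketch k_op_caps_sketch[GGML_OP_COUNT] = {};

static size_t get_qnn_op_index_sketch(const ggml_tensor * op) {
    return (size_t) op->op;    // simplest possible mapping: index directly by the ggml op value
}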
nullptr, nullptr)); if (enable_npu_rpc) { uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); @@ -3214,9 +3285,9 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { *p_tensor2 }; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); if (enable_npu_rpc) { //TODO:NPU RPC feature will failed with test-backend-ops @@ -3231,18 +3302,17 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) { QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; -#if GGMLQNN_PRINT_OP_ADD_LOG op_perf.info(); -#endif } /* - * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add but much more complicated than ggml_qnn_add, + * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated + * than ggml_qnn_general_node. * matrix transpose and type trait are required for offload mulmat to QNN backend, * so it's a standalone function. accordingly, this is another typical skeleton for offload other * ggml ops to QNN backend * - * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, we should focus on MUL_MAT. + * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, should focus on MUL_MAT. * * have three kinds of MUL_MAT to compute: * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend @@ -3288,7 +3358,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); } - print_tensors_info(__func__, ctx, src0, src1, dst); + //print_tensors_info(__func__, ctx, src0, src1, dst); //ensure QNN tensor has correct tensor type QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -3444,8 +3514,8 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { // QNN SDK, details could be found at // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, + tensor_inputs, 2, + tensor_outputs, 1, nullptr, nullptr)); } @@ -3453,7 +3523,6 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; - op_perf.info(); } @@ -3462,13 +3531,17 @@ static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor switch (tensor->op) { case GGML_OP_ADD: - func = ggml_qnn_add; + func = ggml_qnn_general_node; break; case GGML_OP_MUL_MAT: func = ggml_qnn_mul_mat; break; + case GGML_OP_MUL: + func = ggml_qnn_general_node; + break; + default: return false; } @@ -3667,7 +3740,6 @@ static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, s ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; GGML_UNUSED(ctx); - GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE @@ -3715,10 +3787,28 
@@ static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t d } static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - //FIXME:this is NOT QNN device memory info - *free = get_system_free_memory_in_bytes(); - *total = get_system_total_memory_in_bytes(); - GGML_UNUSED(dev); + struct ggml_backend_qnn_context * ctx = static_cast(dev->context); + if ((nullptr == ctx) || (ctx->device > QNN_BACKEND_GGML)) { + GGMLQNN_LOG_ERROR("pls check params"); + *free = 0; + *total = 0; + } + + if (QNN_BACKEND_CPU == ctx->device || QNN_BACKEND_GGML == ctx->device) { + *total = get_system_total_memory_in_bytes(); + *free = get_system_free_memory_in_bytes(); + } else if (QNN_BACKEND_GPU == ctx->device) { + //TODO: probe GPU info in Qualcomm Adreno GPU + *total = get_system_total_memory_in_bytes(); + *free = get_system_free_memory_in_bytes(); + } else if (QNN_BACKEND_NPU == ctx->device) { + size_t rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); + size_t rpc_ion_usage = ctx->instance->get_rpcmem_usage(); + GGMLQNN_LOG_DEBUG("rpc memsize %d", rpc_ion_memsize); + GGMLQNN_LOG_DEBUG("rpc usage %d", rpc_ion_usage); + *total = rpc_ion_memsize * (1 << 20); + *free = (rpc_ion_memsize - rpc_ion_usage) * (1 << 20); + } } static enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { From 340dc4a0c0d0468bd0a8f8ebd087c892d69d648e Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 20 Feb 2025 22:20:15 +0800 Subject: [PATCH 079/200] ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc --- ggml/src/ggml-qnn/ggml-qnn.cpp | 106 +++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 46 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index ee273503b9e8a..9ef502421c051 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1483,15 +1483,13 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); inc_idx(); - //there are different dimension order between ggml tensor and qnn tensor uint32_t dimensions_transpose[GGML_MAX_DIMS] = {}; uint32_t * tensor_dims = nullptr; - if (nullptr != tensor) { - dimensions_transpose[0] = (uint32_t) tensor->ne[1]; - dimensions_transpose[1] = (uint32_t) tensor->ne[0]; - dimensions_transpose[2] = (uint32_t) tensor->ne[2]; - dimensions_transpose[3] = (uint32_t) tensor->ne[3]; + //there are different dimension order between ggml tensor and qnn tensor + for (size_t idx = 0; idx < rank; idx++) { + dimensions_transpose[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + } tensor_dims = dimensions_transpose; } //re-assign tensor_dims @@ -2058,7 +2056,7 @@ class qnn_instance { std::unordered_map _rpcmem_store_map; std::unordered_map _rpcmem_usage_map; size_t _rpcmem_capacity = 512; // mempool size in Mbytes - size_t _rpcmem_usage = 0; // mempool usage in MBytes + size_t _rpcmem_usage = 0; // mempool usage in Mbytes std::string _graph_name; QNNBackend _device_id; @@ -2968,33 +2966,27 @@ static void print_tensors_info(const char * func_name, ggml_backend_qnn_context if (nullptr != func_name && nullptr != ctx) { GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); } - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", src0->name, - src0->type, 
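// Worked numbers for the NPU branch above (illustrative only): qnn_instance tracks the rpcmem
// pool in MB while ggml expects bytes, so with a probed capacity of 2000 MB and 64 MB in use
//   *total = 2000 * (1 << 20)        = 2097152000 bytes
//   *free  = (2000 - 64) * (1 << 20) = 2030043136 bytes
// and a usage of 0 simply reports the whole pool as free.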
ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], + src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - GGMLQNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); - GGMLQNN_LOG_DEBUG("tensor0 name %s", src0->name); - GGMLQNN_LOG_DEBUG("tensor1 name %s", src1->name); - GGMLQNN_LOG_DEBUG("tensor2 name %s", dst->name); + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]); + GGMLQNN_LOG_DEBUG("\n"); } -static void dump_tensors_info(const struct ggml_tensor * tensor) { +static void dump_op_info(const struct ggml_tensor * tensor) { //skip sanity check of params const struct ggml_tensor * src0 = tensor->src[0]; - struct ggml_tensor * src1 = tensor->src[1]; - struct ggml_tensor * dst = const_cast(tensor); - GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), - ggml_type_name(tensor->type)); - GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type)); - GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type)); + struct ggml_tensor * src1 = tensor->src[1]; + struct ggml_tensor * dst = const_cast(tensor); + GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); print_tensors_info(nullptr, nullptr, src0, src1, dst); } @@ -3008,8 +3000,13 @@ static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, u GGMLQNN_LOG_WARN("invalid params"); return; } - qnn_dimensions[0] = ggml_dimensions[1]; - qnn_dimensions[1] = ggml_dimensions[0]; + for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) + qnn_dimensions[idx] = ggml_dimensions[idx]; + + if (rank >= 2) { + qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; + qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; + } } // ================================================================================================= @@ -3060,9 +3057,16 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { } if (tensor->op == GGML_OP_MUL_MAT) { - //dump_tensors_info(tensor); - if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mulmat to QNN backend + dump_op_info(tensor); + if (src0_rank != src1_rank) // make QNN SDK happy + return false; + if (src0_rank < 2) // make QNN SDK happy + return false; + if (src0_rank > 3) //TODO: 4D matrix return false; + if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK 
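// A worked example of the two dimension conversions above, assuming a ggml tensor whose ne[0]
// is the fastest-varying dimension: building the QNN tensor reverses the dimension order, and
// the transposed dst additionally swaps the two innermost QNN dimensions.
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne[4]  = {8, 4, 2, 1};          // ggml order: ne[0] is innermost
    const uint32_t rank  = 3;

    uint32_t qnn_dims[4] = {};
    for (uint32_t idx = 0; idx < rank; idx++) {   // same loop as ggml_qnn_create_general_tensor()
        qnn_dims[idx] = (uint32_t) ne[rank - 1 - idx];
    }
    // qnn_dims is now {2, 4, 8}

    uint32_t transposed[4] = {};
    for (uint32_t idx = 0; idx < rank; idx++) {
        transposed[idx] = qnn_dims[idx];
    }
    transposed[rank - 1] = qnn_dims[rank - 2];    // swap the last two, as in
    transposed[rank - 2] = qnn_dims[rank - 1];    // get_qnn_dimensions_from_ggml_dimensions()
    // transposed is now {2, 8, 4}

    printf("qnn dims: %u %u %u, transposed: %u %u %u\n",
           qnn_dims[0], qnn_dims[1], qnn_dims[2],
           transposed[0], transposed[1], transposed[2]);
    return 0;
}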
happy + return false; + //TODO: support more data type in func ggml_qnn_mul_mat(...) //src0: q4_0, q6_k, ... //src1: f32 @@ -3073,8 +3077,8 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { } if (tensor->op == GGML_OP_MUL) { - dump_tensors_info(tensor); - if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mul to QNN backend + //dump_tensors_info(tensor); + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix return false; return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) @@ -3340,6 +3344,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; op_perf.start(); + uint32_t src0_rank = ggml_get_tensor_rank(src0); + uint32_t src1_rank = ggml_get_tensor_rank(src1); + GGML_ASSERT(src0_rank == src1_rank); + GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation + std::string graph_name; get_graph_key_from_op(op, graph_name); if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { @@ -3353,12 +3362,12 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { p_param_tensor = tensors[3]; p_tensor2_transpose = tensors[4]; } else { - p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); - p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); } - //print_tensors_info(__func__, ctx, src0, src1, dst); + print_tensors_info(__func__, ctx, src0, src1, dst); //ensure QNN tensor has correct tensor type QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -3403,9 +3412,16 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { return; } //step-2: create param tensor for mulmat of 2d matrix - uint32_t param_tensor_dims[] = {2}; - uint32_t param_tensor_data[2] = {1, 0}; - p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1, param_tensor_dims, param_tensor_data, 8); + const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, + }; + uint32_t param_tensor_dims[1] = {src0_rank}; + p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, + 1, param_tensor_dims, + (void *) (param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); //step-3: create compute tensor from ggml tensor @@ -3419,7 +3435,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-4: create a transpose tensor uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; - p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0); + 
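// A worked example for the per-rank "perm" data above, assuming the usual transpose
// convention where output axis i is taken from input axis perm[i]: {1, 0} swaps a 2D matrix,
// while {0, 2, 1} swaps the two innermost axes of a 3D tensor and leaves the batch axis alone.
#include <cstdint>
#include <cstdio>

static void apply_perm(const uint32_t * in_dims, const uint32_t * perm,
                       uint32_t rank, uint32_t * out_dims) {
    for (uint32_t i = 0; i < rank; i++) {
        out_dims[i] = in_dims[perm[i]];
    }
}

int main() {
    const uint32_t dims2[2] = {4, 8};
    const uint32_t perm2[2] = {1, 0};
    uint32_t out2[2];
    apply_perm(dims2, perm2, 2, out2);            // -> {8, 4}

    const uint32_t dims3[3] = {2, 4, 8};
    const uint32_t perm3[3] = {0, 2, 1};
    uint32_t out3[3];
    apply_perm(dims3, perm3, 3, out3);            // -> {2, 8, 4}

    printf("%u %u | %u %u %u\n", out2[0], out2[1], out3[0], out3[1], out3[2]);
    return 0;
}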
p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank(dst)); //save pointer because the dimensions of tensor p_tensor2_transpose will be changed later uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions; @@ -3435,7 +3451,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { } }; - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; Qnn_OpConfig_t out_0 = { QNN_OPCONFIG_VERSION_1, .v1 = @@ -3455,7 +3471,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { "perm", .tensorParam = *p_param_tensor } }; - Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; Qnn_OpConfig_t out_trans1_0 = { QNN_OPCONFIG_VERSION_1, @@ -3472,7 +3488,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-6: finalize qnn graph and execute qnn graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1}; Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors_0, 2, @@ -3495,9 +3511,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //restore pointer to avoid memory leak QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose; //free_qnn_tensor(p_tensor2_transpose); - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; From fe181b806874e707fe41f47aa86b8606186b2cea Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 21 Feb 2025 17:43:25 +0800 Subject: [PATCH 080/200] ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc --- ggml/src/ggml-qnn/ggml-qnn.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 9ef502421c051..e862b07a234eb 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1132,8 +1132,8 @@ struct ggml_backend_qnn_context { struct qcom_socinfo socinfo; } ; -//TODO: the following global vars and three helper funcs should be removed in the future -static int32_t g_ggmltensor_idx = 0; +//the following helper funcs are used to ensure every QNN tensor name is unique +static std::atomic g_ggmltensor_idx(0); static void reset_idx() { g_ggmltensor_idx = 0; } @@ -1143,7 +1143,7 @@ static void inc_idx() { } static int32_t get_idx() { - return g_ggmltensor_idx; + return g_ggmltensor_idx.load(); } // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html @@ -1474,7 +1474,7 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, Qnn_ErrorHandle_t error = QNN_SUCCESS; char tensor_name[GGML_MAX_NAME] = {0}; - //TODO:remove get_idx() and inc_idx() in the future but ensure the tensor name is unique + //ensure the tensor name is unique if 
(nullptr != name) { snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); } else { @@ -2762,7 +2762,6 @@ int qnn_instance::qnn_finalize() { Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("enter %s\n", __func__); - //TODO:should be removed in the future reset_idx(); free_rpcmem(); @@ -3451,7 +3450,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { } }; - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; Qnn_OpConfig_t out_0 = { QNN_OPCONFIG_VERSION_1, .v1 = @@ -3488,7 +3487,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-6: finalize qnn graph and execute qnn graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1}; + Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors_0, 2, From ac59ddabea9d6fd780992248c4e92e63983973aa Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 23 Feb 2025 10:23:03 +0800 Subject: [PATCH 081/200] ggml-qnn: fix a minior typo in internal doc --- ggml/src/ggml-qnn/ggml-qnn.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index e862b07a234eb..effcd5d54648f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -3410,7 +3410,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } - //step-2: create param tensor for mulmat of 2d matrix + //step-2: create param tensor for mulmat of 2d/3d/4d matrix const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { {0}, {1, 0}, From a8effb535c317042cd42b6fbd3997d231c6211fb Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 23 Feb 2025 22:41:31 +0800 Subject: [PATCH 082/200] ggml-qnn: refine function ggml_qnn_create_general_tensor() to avoid complex/redundant pointer operation --- ggml/src/ggml-qnn/ggml-qnn.cpp | 141 ++++++++++++++++++++------------- 1 file changed, 87 insertions(+), 54 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index effcd5d54648f..1b1e280f09505 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -104,6 +104,12 @@ struct ggml_backend_qnn_context; static int free_qnn_tensor(Qnn_Tensor_t * tensor); static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); +static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false); // ================================================================================================= // section-2: ggml-qnn internal troubleshooting function @@ -163,6 +169,7 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const #define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) #define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment 
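// A minimal sketch of the unique-name scheme above: a process-wide atomic counter feeds
// snprintf() so every QNN tensor created by the backend gets a distinct name even when tensors
// are created from several threads. The real code calls get_idx() and inc_idx() separately;
// here the two steps are folded into one fetch_add(). The 64-byte buffer stands in for
// GGML_MAX_NAME.
#include <atomic>
#include <cstdio>
#include <string>

static std::atomic<int32_t> g_tensor_idx_sketch(0);

static std::string next_tensor_name_sketch() {
    char name[64] = {};
    snprintf(name, sizeof(name), "tensor_%-8d", g_tensor_idx_sketch.fetch_add(1));
    return name;
}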
- 1) +#define GQCGT ggml_qnn_create_general_tensor static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? offset @@ -1013,6 +1020,20 @@ static const char * qnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { } } +// helper function to create an operation config +static Qnn_OpConfig_t create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t num_outputs) { + Qnn_OpConfigV1_t v1 = {name, package, type, + num_params, params, + num_inputs, inputs, + num_outputs, outputs + }; + + return (Qnn_OpConfig_t){QNN_OPCONFIG_VERSION_1, .v1 = v1}; +} + // ================================================================================================= // section-5:ggml-qnn backend helper macro / data structure / function / class // ================================================================================================= @@ -1469,10 +1490,32 @@ static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { return nullptr; } -static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - char tensor_name[GGML_MAX_NAME] = {0}; +static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) { + if (rank > GGML_MAX_DIMS) { + GGMLQNN_LOG_WARN("invalid params"); + return; + } + if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { + GGMLQNN_LOG_WARN("invalid params"); + return; + } + for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) + qnn_dimensions[idx] = ggml_dimensions[idx]; + + if (rank >= 2) { + qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; + qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; + } +} + +static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {}; //ensure the tensor name is unique if (nullptr != name) { @@ -1483,19 +1526,36 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); inc_idx(); - uint32_t dimensions_transpose[GGML_MAX_DIMS] = {}; - uint32_t * tensor_dims = nullptr; + uint32_t reverse_dims[GGML_MAX_DIMS] = {}; + uint32_t transpose_dims[GGML_MAX_DIMS] = {}; + uint32_t * tensor_dims = nullptr; + //case 1:use dims info from ggml tensor if (nullptr != tensor) { //there are different dimension order between ggml tensor and qnn tensor for (size_t idx = 0; idx < rank; idx++) { - dimensions_transpose[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; } - tensor_dims = dimensions_transpose; + tensor_dims = reverse_dims; } - //re-assign tensor_dims + //case 2: use user's specified tensor_dims if (nullptr != dims) { tensor_dims = dims; } + //case 3: transpose for dst tensor + if (b_transpose) { + GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case + + get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_get_tensor_rank(tensor)); + 
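// A usage sketch for the create_op_config() helper above: building the elementwise-add node
// that ggml_qnn_general_node() submits for GGML_OP_ADD. This assumes it runs inside that
// function, where p_tensor0/p_tensor1/p_tensor2, graph_handle, error, qnn_raw_interface and
// CHECK_QNN_API are already in scope; it is not a drop-in replacement for the existing code.
Qnn_Tensor_t add_inputs[]  = {*p_tensor0, *p_tensor1};
Qnn_Tensor_t add_outputs[] = {*p_tensor2};
Qnn_OpConfig_t add_cfg = create_op_config("ggml_op_add",
                                          QNN_OP_PACKAGE_NAME_QTI_AISW,
                                          QNN_OP_ELEMENT_WISE_ADD,
                                          nullptr, 0,          // elementwise add needs no params
                                          add_inputs, 2,
                                          add_outputs, 1);
CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, add_cfg));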
tensor_dims = transpose_dims; +#if 0 + for (size_t idx = 0; idx < 4; idx++) { + GGMLQNN_LOG_DEBUG("origin dim[%d]=%d\n", idx, reverse_dims[idx]); + } + for (size_t idx = 0; idx < 4; idx++) { + GGMLQNN_LOG_DEBUG("trans dim[%d]=%d\n", idx, transpose_dims[idx]); + } +#endif + } Qnn_Tensor_t qnn_tensor = { .version= QNN_TENSOR_VERSION_1, @@ -2989,25 +3049,6 @@ static void dump_op_info(const struct ggml_tensor * tensor) { print_tensors_info(nullptr, nullptr, src0, src1, dst); } -//TODO: currently only support offloading 2D matrix to QNN backend -static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, uint32_t * ggml_dimensions, uint32_t rank) { - if (rank > GGML_MAX_DIMS) { - GGMLQNN_LOG_WARN("invalid params"); - return; - } - if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { - GGMLQNN_LOG_WARN("invalid params"); - return; - } - for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) - qnn_dimensions[idx] = ggml_dimensions[idx]; - - if (rank >= 2) { - qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; - qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; - } -} - // ================================================================================================= // section-6: implementation of ggml-qnn backend // ================================================================================================= @@ -3056,10 +3097,9 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { } if (tensor->op == GGML_OP_MUL_MAT) { - dump_op_info(tensor); if (src0_rank != src1_rank) // make QNN SDK happy return false; - if (src0_rank < 2) // make QNN SDK happy + if (src0_rank < 2) // QNN's limitation, make QNN SDK happy return false; if (src0_rank > 3) //TODO: 4D matrix return false; @@ -3327,7 +3367,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { bool graph_initialized = false; qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); qnn_instance * instance = nullptr; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * p_tensor0 = nullptr; Qnn_Tensor_t * p_tensor1 = nullptr; @@ -3361,11 +3401,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { p_param_tensor = tensors[3]; p_tensor2_transpose = tensors[4]; } else { - p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); } - print_tensors_info(__func__, ctx, src0, src1, dst); //ensure QNN tensor has correct tensor type @@ -3418,9 +3457,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { {0, 1, 3, 2}, }; uint32_t param_tensor_dims[1] = {src0_rank}; - p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, - 
1, param_tensor_dims, - (void *) (param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); + p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); //step-3: create compute tensor from ggml tensor @@ -3433,13 +3470,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; //step-4: create a transpose tensor - uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; - p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank(dst)); - //save pointer because the dimensions of tensor p_tensor2_transpose will be changed later - uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions; - //update dimensions of tensor p_tensor2_transpose to make QNN SDK happy - QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_transpose_dims; + p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); //step-5: compose qnn graph: add mat_mul node @@ -3452,6 +3483,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; +#if 0 Qnn_OpConfig_t out_0 = { QNN_OPCONFIG_VERSION_1, .v1 = {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, @@ -3462,6 +3494,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { 1, out_0_outputs} }; +#else + Qnn_OpConfig_t out_0 = create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); +#endif CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); //step-5: compose qnn graph: add transpose node @@ -3472,10 +3508,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { }; Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; +#if 0 Qnn_OpConfig_t out_trans1_0 = { QNN_OPCONFIG_VERSION_1, .v1 = {"ggmlqnn_mulmat_transpose_opconfig", - "qti.aisw", + QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, 1, out_trans1_0_params, 1, @@ -3483,6 +3520,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { 1, out_trans1_0_outputs} }; +#else + Qnn_OpConfig_t out_trans1_0 = create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, + out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); +#endif CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); //step-6: finalize qnn graph and execute qnn graph @@ -3501,15 +3542,8 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { ggml_op_mulmat_tensors.push_back(p_tensor2); ggml_op_mulmat_tensors.push_back(p_param_tensor); ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); - auto graph_item = std::make_tuple(graph_handle, 
ggml_op_mulmat_tensors); instance->_qnn_graph_map[graph_name] = graph_item; - - //avoid cleanup these resource to make test_backend_ops happy - //free_qnn_tensor(p_param_tensor); - //restore pointer to avoid memory leak - QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose; - //free_qnn_tensor(p_tensor2_transpose); } else { QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; @@ -3522,7 +3556,6 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_Tensor_t tensor_outputs[] = { *p_tensor2 }; - //attention: // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through // QNN SDK, details could be found at // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph From c8058be481e4b0921ae6a6bc333348537dbe5052 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 24 Feb 2025 09:58:42 +0800 Subject: [PATCH 083/200] ggml-qnn: fix a minor typo in source code --- ggml/src/ggml-qnn/ggml-qnn.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 1b1e280f09505..120cea777ea20 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -2654,20 +2654,20 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { GGMLQNN_LOG_DEBUG("load QNN system lib successfully\n"); } - std::string bakend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(bakend_lib_path)) { - int is_load_ok = load_backend(bakend_lib_path, saver_config); + std::string backend_lib_path = _lib_path + _backend_name; + if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { + int is_load_ok = load_backend(backend_lib_path, saver_config); if (0 != is_load_ok) { GGMLQNN_LOG_WARN("failed to load QNN backend\n"); return 2; } } - backend_id = _lib_path_to_backend_id[bakend_lib_path]; + backend_id = _lib_path_to_backend_id[backend_lib_path]; if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { GGMLQNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", - bakend_lib_path.c_str(), + backend_lib_path.c_str(), _loaded_backend.count(backend_id), _loaded_lib_handle.count(backend_id)); return 3; From 6a598ef6db995447326451eb4ae04c472494864e Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 24 Feb 2025 16:59:12 +0800 Subject: [PATCH 084/200] build: avoid ggml-qnn backend breaking other backend's builds --- ggml/src/ggml-qnn/CMakeLists.txt | 35 -------------------------------- 1 file changed, 35 deletions(-) delete mode 100644 ggml/src/ggml-qnn/CMakeLists.txt diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt deleted file mode 100644 index 1156c98fbc9d7..0000000000000 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ /dev/null @@ -1,35 +0,0 @@ -message(STATUS "Using QNN backend") - -if(CMAKE_SYSTEM_NAME STREQUAL "Android") - find_library(LOG_LIB log) - set(QNN_LINK_LIBRARIES ${LOG_LIB}) - set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") -elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") - set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") -else() - message(FATAL_ERROR "QNN now only available on Android and Windows(Windows on ARM)") -endif() - -if(NOT DEFINED 
GGML_QNN_SDK_PATH) -# try read from environment variable - if(DEFINED ENV{QNN_SDK_PATH}) - set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) - else() - message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") - endif() -endif() - -message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") - -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") - -file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") - ggml_add_backend_library(ggml-qnn - ${QNN_SOURCES} -) - -target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) -target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) - -string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") -target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") From 4b3f2415deb7af5dc63f261a5a803f619b4f2324 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 25 Feb 2025 08:22:27 +0800 Subject: [PATCH 085/200] ggml-qnn: remove redundant codes to make PR reviewers happy --- ggml/src/ggml-qnn/ggml-qnn.cpp | 409 ++------------------------------- 1 file changed, 14 insertions(+), 395 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 120cea777ea20..8b33d346bd91a 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -18,8 +18,6 @@ * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly * - * of course, can porting ggml-qnn to Windows on ARM as need. - * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to * deal in the Software without restriction, including without limitation the @@ -38,7 +36,6 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
*/ - #include #include #include @@ -144,11 +141,8 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const #if (defined __ANDROID__) || (defined ANDROID) //for Android application(standard APP or command line tool) __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); -#endif -#if (defined __ANDROID__) || (defined ANDROID) - //do nothing when running on Snapdragon based Android device #else - //for Snapdragon based WoA(Windows on ARM) device + //for Snapdragon based WoA(Windows on ARM) device or Linux printf("%s\n", s_ggmlqnn_log_internal_buf); #endif } @@ -167,8 +161,6 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const class_name(class_name &&) = delete; \ void operator=(class_name &&) = delete -#define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) -#define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) #define GQCGT ggml_qnn_create_general_tensor static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { @@ -178,62 +170,36 @@ static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { offset % static_cast(alignment)); } -static void * ggmlqnn_mallocz_aligned(size_t size, size_t alignment) { - uint8_t * buffer = NULL; - size_t * sp = NULL; - buffer = static_cast(calloc(1, size + GGMLQNN_MEM_ADD(alignment))); - if (!buffer) - return NULL; - sp = (size_t *)buffer; - *sp = size; - buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); - buffer[-1] = buffer - (uint8_t *)sp; - return buffer; -} - -static void * ggmlqnn_malloc_aligned(size_t size, size_t alignment) { - uint8_t * buffer = NULL; - size_t * sp = NULL; - buffer = static_cast(malloc(size + GGMLQNN_MEM_ADD(alignment))); - if (!buffer) - return NULL; - sp = (size_t *)buffer; - *sp = size; - buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); - buffer[-1] = buffer - (uint8_t *)sp; - return buffer; -} - -static void ggmqnn_free_aligned(void * ptr) { - uint8_t * old = (uint8_t *)ptr; - if (!old) - return; - old -= old[-1]; - free(old); -} - static size_t get_system_total_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) struct sysinfo info = {}; - if (sysinfo(&info) == 0) { + if (0 == sysinfo(&info)) { return (info.totalram + info.totalswap) * info.mem_unit; } - auto pages = (size_t)sysconf(_SC_PHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return pages * page_size; +#else + //TODO: Snapdragon based WoA(Windows on ARM) + return 0; +#endif } static size_t get_system_free_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) struct sysinfo info = {}; - if (sysinfo(&info) == 0) { + if (0 == sysinfo(&info)) { return (info.freeram + info.freeswap) * info.mem_unit; } - auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return avail_pages * page_size; +#else + //TODO: Snapdragon based WoA(Windows on ARM) + return 0; +#endif } static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { @@ -288,34 +254,7 @@ static void * ggmlqnn_host_malloc(size_t n) { #define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) -#define VALIDATE_OP_CONFIG_VERSION(op, err) VALIDATE(validate_op_config_version(op), err) - #define QNN_VER_PTR(x) (&((x).v1)) -#define QNN_OP_CFG_VALID(op_config) ((op_config).version == QNN_OPCONFIG_VERSION_1) - -#define 
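// A small usage sketch for the two helpers above: on Android/Linux they report system RAM via
// sysinfo(), while the Windows-on-ARM path currently returns 0, so callers should treat 0 as
// "unknown" rather than "out of memory". Assumes the helpers are visible in this translation
// unit.
#include <cstddef>
#include <cstdio>

static void log_system_memory_sketch() {
    const size_t total    = get_system_total_memory_in_bytes();
    const size_t free_mem = get_system_free_memory_in_bytes();
    if (total == 0) {
        printf("system memory size unknown on this platform\n");
        return;
    }
    printf("system memory: %zu MB total, %zu MB free\n",
           total / (1 << 20), free_mem / (1 << 20));
}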
QNN_OP_CFG_GET_NAME(op_config) get_qnn_oponfig_name(op_config) -#define QNN_OP_CFG_GET_PACKAGE_NAME(op_config) get_qnn_op_config_packagename(op_config) -#define QNN_OP_CFG_GET_TYPE_NAME(op_config) get_qnn_op_config_typename(op_config) -#define QNN_OP_CFG_GET_NUM_PARAMS(op_config) get_qnn_op_config_numparams(op_config) -#define QNN_OP_CFG_GET_PARAMS(op_config) get_qnn_op_config_params(op_config) -#define QNN_OP_CFG_GET_NUM_INPUTS(op_config) get_qnn_op_config_numinputs(op_config) -#define QNN_OP_CFG_GET_INPUTS(op_config) get_qnn_op_config_inputs(op_config) -#define QNN_OP_CFG_GET_NUM_OUTPUTS(op_config) get_qnn_op_config_numoutputs(op_config) -#define QNN_OP_CFG_GET_OUTPUTS(op_config) get_qnn_op_config_outputs(op_config) - -#define QNN_OP_CFG_SET_NAME(op_config, value) set_qnn_op_config_name(op_config, value) -#define QNN_OP_CFG_SET_PACKAGE_NAME(op_config, value) set_qnn_op_config_packagename(op_config, value) -#define QNN_OP_CFG_SET_TYPE_NAME(op_config, value) set_qnn_op_config_typename(op_config, value) - -#define QNN_OP_CFG_SET_PARAMS(op_config, num_of_params, params) \ - set_qnn_op_config_params(op_config, num_of_params, params) - -#define QNN_OP_CFG_SET_INPUTS(op_config, num_of_inputs, inputTensors) \ - set_qnn_op_config_inputs(op_config, num_of_inputs, inputTensors) - -#define QNN_OP_CFG_SET_OUTPUTS(op_config, num_of_outputs, output_tensors) \ - set_qnn_op_config_outputs(op_config, num_of_outputs, output_tensors) - #define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) #define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) @@ -350,190 +289,6 @@ static inline int validate_tensor_version(Qnn_Tensor_t tensor) { return 0; } -[[maybe_unused]] static inline int validate_op_config_version(Qnn_OpConfig_t op_config) { - if (op_config.version != QNN_OPCONFIG_VERSION_1) { - GGMLQNN_LOG_WARN("validate_op_config_version() op %s, got unsupported version %d\n", - op_config.v1.name, - op_config.version); - return 1; - } - return 0; -} - -static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.name; - } - return nullptr; -} - -[[maybe_unused]] static inline const char * get_qnn_oponfig_name(const Qnn_OpConfig_t * op_config) { - return get_qnn_oponfig_name(*op_config); -} - -static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.packageName; - } - return nullptr; -} - -[[maybe_unused]] static inline const char * get_qnn_op_config_packagename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_packagename(*op_config); -} - -static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.typeName; - } - return nullptr; -} - -[[maybe_unused]] static inline const char * get_qnn_op_config_typename(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_typename(*op_config); -} - -static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfParams; - } - return 0u; -} - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numparams(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numparams(*op_config); -} - -static inline const Qnn_Param_t * get_qnn_op_config_params(const 
Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.params; - } - return nullptr; -} - -[[maybe_unused]] static inline const Qnn_Param_t * get_qnn_op_config_params(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_params(*op_config); -} - -static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfInputs; - } - return 0u; -} - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numinputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numinputs(*op_config); -} - -static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.inputTensors; - } - return nullptr; -} - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_inputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_inputs(*op_config); -} - -static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.numOfOutputs; - } - return 0u; -} - -[[maybe_unused]] static inline uint32_t get_qnn_op_config_numoutputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_numoutputs(*op_config); -} - -static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t & op_config) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - return op_config.v1.outputTensors; - } - return nullptr; -} - -[[maybe_unused]] static inline const Qnn_Tensor_t * get_qnn_op_config_outputs(const Qnn_OpConfig_t * op_config) { - return get_qnn_op_config_outputs(*op_config); -} - -static inline void set_qnn_op_config_name(Qnn_OpConfig_t & op_config, const char * name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.name = name; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_name(Qnn_OpConfig_t * op_config, const char * name) { - set_qnn_op_config_name(*op_config, name); -} - -static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t & op_config, const char * package_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.packageName = package_name; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_packagename(Qnn_OpConfig_t * op_config, const char * package_name) { - set_qnn_op_config_packagename(*op_config, package_name); -} - -static inline void set_qnn_op_config_typename(Qnn_OpConfig_t & op_config, const char * type_name) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.typeName = type_name; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_typename(Qnn_OpConfig_t * op_config, const char * type_name) { - set_qnn_op_config_typename(*op_config, type_name); -} - -static inline void set_qnn_op_config_params(Qnn_OpConfig_t & op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfParams = num_of_params; - op_config.v1.params = params; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_params(Qnn_OpConfig_t * op_config, - uint32_t num_of_params, - Qnn_Param_t * params) { - set_qnn_op_config_params(*op_config, num_of_params, params); -} - -static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - if (op_config.version == 
QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfInputs = num_of_inputs; - op_config.v1.inputTensors = input_tensors; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_inputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_inputs, - Qnn_Tensor_t * input_tensors) { - set_qnn_op_config_inputs(*op_config, num_of_inputs, input_tensors); -} - -static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t & op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - if (op_config.version == QNN_OPCONFIG_VERSION_1) { - op_config.v1.numOfOutputs = num_of_outputs; - op_config.v1.outputTensors = output_tensors; - } -} - -[[maybe_unused]] static inline void set_qnn_op_config_outputs(Qnn_OpConfig_t * op_config, - uint32_t num_of_outputs, - Qnn_Tensor_t * output_tensors) { - set_qnn_op_config_outputs(*op_config, num_of_outputs, output_tensors); -} - static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.id; @@ -542,10 +297,6 @@ static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { return 0u; } -[[maybe_unused]] static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorid(*tensor); -} - static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.name; @@ -553,10 +304,6 @@ static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { return nullptr; } -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t * tensor) { - return get_qnn_tensorname(*tensor); -} - static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.type; @@ -564,10 +311,6 @@ static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { return QNN_TENSOR_TYPE_UNDEFINED; } -[[maybe_unused]] static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensortype(*tensor); -} - static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataFormat; @@ -575,10 +318,6 @@ static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_ return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } -[[maybe_unused]] static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dataformat(*tensor); -} - static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataType; @@ -586,10 +325,6 @@ static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor return QNN_DATATYPE_UNDEFINED; } -[[maybe_unused]] static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_datatype(*tensor); -} - static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.quantizeParams; @@ -597,10 +332,6 @@ static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t return QNN_QUANTIZE_PARAMS_INIT; } -[[maybe_unused]] static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_quantparams(*tensor); -} - static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == 
QNN_TENSOR_VERSION_1) { return tensor.v1.rank; @@ -608,10 +339,6 @@ static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { return 0u; } -[[maybe_unused]] static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_rank(*tensor); -} - static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dimensions; @@ -619,10 +346,6 @@ static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) return nullptr; } -[[maybe_unused]] static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_dimensions(*tensor); -} - static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.memType; @@ -630,153 +353,72 @@ static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & te return QNN_TENSORMEMTYPE_UNDEFINED; } -[[maybe_unused]] static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memtype(*tensor); -} - -static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.clientBuf; - } - return QNN_CLIENT_BUFFER_INIT; -} - -[[maybe_unused]] static inline Qnn_ClientBuffer_t get_qnn_tensor_clientbuf(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_clientbuf(*tensor); -} - -static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memHandle; - } - return nullptr; -} - -[[maybe_unused]] static inline Qnn_MemHandle_t get_qnn_tensor_memhandle(const Qnn_Tensor_t * tensor) { - return get_qnn_tensor_memhandle(*tensor); -} - static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; } } -[[maybe_unused]] static inline void set_qnn_tensor_id(Qnn_Tensor_t * tensor, uint32_t id) { - set_qnn_tensor_id(*tensor, id); -} - static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.name = name; } } -[[maybe_unused]] static inline void set_qnn_tensor_name(Qnn_Tensor_t * tensor, const char * name) { - set_qnn_tensor_name(*tensor, name); -} - static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; } } -[[maybe_unused]] static inline void set_qnn_tensor_type(Qnn_Tensor_t * tensor, Qnn_TensorType_t type) { - set_qnn_tensor_type(*tensor, type); -} - static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataFormat = format; } } -[[maybe_unused]] static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t * tensor, Qnn_TensorDataFormat_t format) { - set_qnn_tensor_dataformat(*tensor, format); -} - static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataType = dataType; } } -[[maybe_unused]] static inline void set_qnn_tensor_datatype(Qnn_Tensor_t * tensor, Qnn_DataType_t dataType) { - set_qnn_tensor_datatype(*tensor, dataType); -} - static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { 
if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.quantizeParams = params; } } -[[maybe_unused]] static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t * tensor, Qnn_QuantizeParams_t params) { - set_qnn_tensor_quantparams(*tensor, params); -} - static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.rank = rank; } } -[[maybe_unused]] static inline void set_qnn_tensor_rank(Qnn_Tensor_t * tensor, uint32_t rank) { - set_qnn_tensor_rank(*tensor, rank); -} - static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dimensions = dims; } } -[[maybe_unused]] static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t * tensor, uint32_t * dims) { - set_qnn_tensor_dimensions(*tensor, dims); -} - static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memType = memType; } } -[[maybe_unused]] static inline void set_qnn_tensor_memtype(Qnn_Tensor_t * tensor, Qnn_TensorMemType_t memType) { - set_qnn_tensor_memtype(*tensor, memType); -} - static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.clientBuf = clientBuf; } } -[[maybe_unused]] static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t * tensor, Qnn_ClientBuffer_t clientBuf) { - set_qnn_tensor_clientbuf(*tensor, clientBuf); -} - static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memHandle = handle; } } -[[maybe_unused]] static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t * tensor, Qnn_MemHandle_t handle) { - set_qnn_tensor_memhandle(*tensor, handle); -} - -inline static Qnn_Tensor_t qnn_tensor_init(Qnn_TensorVersion_t version) { - Qnn_Tensor_t tensor; - tensor.version = version; - if (version == QNN_TENSOR_VERSION_1) { - tensor.v1 = QNN_TENSOR_V1_INIT; - } else if (version == QNN_TENSOR_VERSION_2) { - tensor.v2 = QNN_TENSOR_V2_INIT; - } - return tensor; -} - static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; VALIDATE_TENSOR_VERSION(src, err); @@ -2445,22 +2087,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * } _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; - -#if 0 // leave them here for further use - QnnSaver_Config_t outputdir_cfg; - outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY; - outputdir_cfg.outputDirectory = "/data/local/tmp/"; - QnnSaver_Config_t backendid_cfg; - backendid_cfg.option = QNN_SAVER_CONFIG_OPTION_BACKEND_ID; - backendid_cfg.backendId = _backend_id; - - const QnnSaver_Config_t * saver_cfg[] = {&outputdir_cfg, &backendid_cfg, nullptr}; - if (0 == QnnSaver_initialize(saver_cfg)) { - GGMLQNN_LOG_INFO("QnnSaver_initialize successfully"); - } else { - GGMLQNN_LOG_WARN("QnnSaver_initialize failure"); - } -#endif + auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( _loaded_lib_handle[backend_id], "QnnSaver_initialize"); @@ -3682,14 +3309,6 @@ static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t memset(ctx->buffer, value, ctx->buffer_size); } -[[maybe_unused]]static void ggml_backend_qnn_buffer_reset(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context 
*)buffer->context; - for (auto * sub_buffer : ctx->sub_buffers) { - free(sub_buffer); - } - ctx->sub_buffers.clear(); -} - static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, /* .get_base = */ ggml_backend_qnn_buffer_get_base, From 7ea4d3a45277bb234f2ff80f18d984efa2dd1412 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 25 Feb 2025 13:49:04 +0800 Subject: [PATCH 086/200] ggml-qnn: refine code format --- ggml/src/ggml-qnn/ggml-qnn.cpp | 285 ++++++++++++++++++++------------- 1 file changed, 177 insertions(+), 108 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 8b33d346bd91a..aaf9fd694f8b9 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -44,12 +44,14 @@ #include #include #include +#if defined(__ANDROID__) || defined(__linux__) #include #include #include #include #include #include +#endif #include #include @@ -77,6 +79,10 @@ #include "android/log.h" #endif +#if defined(_WIN32) || defined(_MSC_VER) +#include +#endif + #include "QnnTypes.h" #include "QnnCommon.h" #include "QnnContext.h" @@ -98,7 +104,7 @@ // ================================================================================================= class qnn_instance; struct ggml_backend_qnn_context; -static int free_qnn_tensor(Qnn_Tensor_t * tensor); +static int free_qnn_tensor(Qnn_Tensor_t * tensor); static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, @@ -180,9 +186,11 @@ static size_t get_system_total_memory_in_bytes() { auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return pages * page_size; -#else +#elif defined(_WIN32) || defined(_MSC_VER) //TODO: Snapdragon based WoA(Windows on ARM) return 0; +#else +#error "ggml-qnn only support WoA, Android, Linux" #endif } @@ -196,9 +204,11 @@ static size_t get_system_free_memory_in_bytes() { auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return avail_pages * page_size; -#else +#elif defined(_WIN32) || defined(_MSC_VER) //TODO: Snapdragon based WoA(Windows on ARM) return 0; +#else +#error "ggml-qnn only support WoA, Android, Linux" #endif } @@ -218,12 +228,19 @@ static char * ggmlqnn_strndup(const char * source, size_t maxlen) { } static void * ggmlqnn_host_malloc(size_t n) { - void * data = NULL; - int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n); +#if defined(__ANDROID__) || defined(__linux__) + void * data = nullptr; + int result = posix_memalign((void **)&data, sysconf(_SC_PAGESIZE), n); if (result != 0) { GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); - return NULL; + return nullptr; } +#elif defined(_WIN32) || defined(_MSC_VER) + //TODO: Snapdragon based WoA(Windows on ARM) + return nullptr; +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif return data; } @@ -231,15 +248,6 @@ static void * ggmlqnn_host_malloc(size_t n) { // ================================================================================================= // section-4: QNN helper macro / data structure / function // ================================================================================================= -#define VALIDATE(value, status) \ - do { \ - status = value; \ - if (status != QNN_SUCCESS) { \ - GGMLQNN_LOG_WARN("%s 
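/*
 * Illustrative sketch (not part of this patch): on Windows the two memory
 * probes above currently return 0 and leave a TODO for Windows-on-ARM. One
 * possible shape of that counterpart, based on the standard Win32
 * GlobalMemoryStatusEx() call (whether plain Win32 is the right tool for WoA
 * here is an assumption):
 */
#if defined(_WIN32) || defined(_MSC_VER)
#include <windows.h>

static size_t demo_total_memory_in_bytes(void) {
    MEMORYSTATUSEX status;
    status.dwLength = sizeof(status);      // must be set before the call
    if (!GlobalMemoryStatusEx(&status)) {
        return 0;                          // keep the existing fallback behaviour
    }
    return (size_t)status.ullTotalPhys;
}

static size_t demo_free_memory_in_bytes(void) {
    MEMORYSTATUSEX status;
    status.dwLength = sizeof(status);
    if (!GlobalMemoryStatusEx(&status)) {
        return 0;
    }
    return (size_t)status.ullAvailPhys;    // "available" physical memory ~ free
}
#endif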
expected QNN_SUCCESS\n", #value); \ - return status; \ - } \ - } while (0) - #define CHECK_QNN_API(error, result) \ do { \ error = (result); \ @@ -252,8 +260,6 @@ static void * ggmlqnn_host_malloc(size_t n) { } \ } while (0) -#define VALIDATE_TENSOR_VERSION(tensor, err) VALIDATE(validate_tensor_version(tensor), err) - #define QNN_VER_PTR(x) (&((x).v1)) #define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) @@ -279,16 +285,6 @@ static void * ggmlqnn_host_malloc(size_t n) { #define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) #define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) -static inline int validate_tensor_version(Qnn_Tensor_t tensor) { - if (tensor.version != QNN_TENSOR_VERSION_1) { - GGMLQNN_LOG_WARN("validate_tensor_version() tensor %s, got unsupported version %d\n", - tensor.v1.name, - tensor.version); - return 1; - } - return 0; -} - static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.id; @@ -421,7 +417,6 @@ static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; - VALIDATE_TENSOR_VERSION(src, err); dst.version = src.version; QNN_TENSOR_SET_NAME( @@ -492,7 +487,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { static int free_qnn_tensor(Qnn_Tensor_t * tensor) { int err = 0; - VALIDATE_TENSOR_VERSION(*tensor, err); + free((void *) QNN_TENSOR_GET_NAME(*tensor)); Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); @@ -511,7 +506,6 @@ static int free_qnn_tensor(Qnn_Tensor_t * tensor) { return err; } - static size_t qnn_datatype_size(Qnn_DataType_t qnn_type) { switch (qnn_type) { case QNN_DATATYPE_FLOAT_32: @@ -720,6 +714,11 @@ enum qcom_chipset_soc_model { SM8550 = 43, // v73, SD 8 Gen 2 SM8650 = 57, // v75, SD 8 Gen 3 SM8750 = 69, // v79, SD 8 Gen 4 +#if defined(_WIN32) || defined(_MSC_VER) + SC7280X = 44, + SC8280X = 37, + SC8380XP = 60, +#endif }; struct qcom_socinfo { @@ -780,6 +779,29 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, +#if defined(_WIN32) || defined(_MSC_VER) + /* Qualcomm SnapDragon 7c Gen 2 */ + [SC7280X] = { + .soc_model = SC7280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7c Gen 2"}, + + /* Qualcomm SnapDragon 8cx Gen 3 */ + [SC8280X] = { + .soc_model = SC8280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 3"}, + + /* Qualcomm SnapDragon 8cx Gen 4 */ + [SC8380XP] = { + .soc_model = SC8380XP, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 4"}, +#endif + }; struct ggml_backend_qnn_context { @@ -820,7 +842,11 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-cpu", .desc = "Qualcomm Kryo CPU", +#if defined(_WIN32) || defined(_MSC_VER) + .lib = "QnnCpu.dll", +#else .lib = "libQnnCpu.so", +#endif .instance = nullptr, .backend = nullptr, .raw_interface = {}, @@ -831,7 +857,11 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-gpu", .desc = "Qualcomm Adreno GPU", +#if defined(_WIN32) || defined(_MSC_VER) + .lib = "QnnGpu.dll", +#else .lib = "libQnnGpu.so", +#endif .instance = nullptr, .backend = 
nullptr, .raw_interface = {}, @@ -842,7 +872,11 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-npu", .desc = "Qualcomm NPU(Hexagon Tensor Processor)", +#if defined(_WIN32) || defined(_MSC_VER) + .lib = "QnnHtp.dll", +#else .lib = "libQnnHtp.so", +#endif .instance = nullptr, .backend = nullptr, .raw_interface = {}, @@ -1252,8 +1286,8 @@ static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); Qnn_Tensor_t * p_qnn_tensor = ggml_qnn_create_general_tensor(tensor, nullptr, qnn_tensor_type, qnn_data_type, - ggml_n_dims(tensor), dimensions, - nullptr, 0); + ggml_n_dims(tensor), dimensions, + nullptr, 0); return p_qnn_tensor; } @@ -1351,7 +1385,14 @@ class qnn_perf { template Fn load_qnn_functionpointers(void * handle, const char * function_name) { +#if defined(__ANDROID__) || defined(__linux__) return reinterpret_cast(dlsym(handle, function_name)); +#elif defined(_WIN32) || defined(_MSC_VER) + //TODO: Snapdragon based WoA(Windows on ARM) + return nullptr; +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif } class qnn_interface { @@ -1485,7 +1526,7 @@ class qnn_instance { using BackendIdType = decltype(QnnInterface_t{}.backendId); explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, - const std::string & model_name) : + const std::string & model_name) : _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), _model_name(std::move(model_name)) {}; @@ -1567,8 +1608,7 @@ class qnn_instance { if (_qnn_rpc_pollingtime > 0) { QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); - rpc_pollingtime.option = - QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingtime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; if (_qnn_htp_perfinfra) { @@ -1653,16 +1693,15 @@ class qnn_instance { } void probe_device_meminfo() { - size_t candidate_size = 0; - uint8_t *rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); if (nullptr == rpc_buffer) { - GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], - strerror(errno)); + GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -1697,17 +1736,17 @@ class qnn_instance { void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { _qnn_raw_system_interface = raw_interface; } - + void * alloc_rpcmem_internal(size_t bytes, size_t alignment); private: static constexpr const int _required_num_providers = 1; private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // name of prebuilt QNN model, might be used in the future - BackendIdType _backend_id; + std::string _lib_path; + 
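/*
 * Illustrative sketch (not part of this patch): load_qnn_functionpointers()
 * above is stubbed out to return nullptr on Windows. A hedged sketch of the
 * Win32 analogue of the dlopen()/dlsym() pair it relies on elsewhere
 * (the demo_* names are hypothetical, not from this backend):
 */
#if defined(_WIN32) || defined(_MSC_VER)
#include <windows.h>

static void * demo_open_library(const char * path) {
    return (void *)LoadLibraryA(path);                 // counterpart of dlopen()
}

template <typename Fn>
static Fn demo_load_symbol(void * handle, const char * function_name) {
    // counterpart of dlsym(); FARPROC is cast to the caller's function type
    return reinterpret_cast<Fn>(GetProcAddress((HMODULE)handle, function_name));
}
#endif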
std::string _backend_name; + std::string _model_name; // name of prebuilt QNN model, might be used in the future + BackendIdType _backend_id; bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated @@ -1715,17 +1754,15 @@ class qnn_instance { ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; - qnn_interface _qnn_interface; - - void * _system_lib_handle = nullptr; + void * _system_lib_handle = nullptr; - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - Qnn_LogHandle_t _qnn_log_handle = nullptr; + Qnn_LogHandle_t _qnn_log_handle = nullptr; Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; Qnn_BackendHandle_t _qnn_backend_handle = nullptr; @@ -1733,10 +1770,11 @@ class qnn_instance { QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - QnnHtpDevice_PerfInfrastructure_t *_qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + qnn_interface _qnn_interface; QNN_INTERFACE_VER_TYPE _qnn_raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; @@ -1748,7 +1786,6 @@ class qnn_instance { static std::unordered_map _lib_path_to_backend_id; static std::unordered_map _loaded_backend; - void * _rpc_lib_handle = nullptr; std::atomic_bool _rpcmem_initialized{false}; pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; pfn_rpc_mem_free _pfn_rpc_mem_free; @@ -1757,12 +1794,13 @@ class qnn_instance { pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; std::unordered_map _rpcmem_usage_map; - size_t _rpcmem_capacity = 512; // mempool size in Mbytes size_t _rpcmem_usage = 0; // mempool usage in Mbytes + size_t _rpcmem_capacity = 512; // mempool size in Mbytes std::string _graph_name; QNNBackend _device_id; - bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature + void * _rpc_lib_handle = nullptr; + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature DISABLE_COPY(qnn_instance); DISABLE_MOVE(qnn_instance); @@ -1781,13 +1819,13 @@ void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { auto allocate_bytes = static_cast(bytes + alignment); void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); - if (buf == nullptr) { + if (nullptr == buf) { GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); return nullptr; } auto aligned_buf = reinterpret_cast(ggmlqnn_align_to(alignment, - reinterpret_cast(buf))); + reinterpret_cast(buf))); bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; if (!status) { GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); @@ -1886,13 +1924,13 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { GGMLQNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); - return 4; + return 3; } int32_t mem_fd = rpcmem_to_fd(p_data); if (-1 == mem_fd) { GGMLQNN_LOG_WARN("failed to get file descriptor\n"); - return 5; + return 4; } GGMLQNN_LOG_DEBUG("mem_fd 
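/*
 * Illustrative sketch (not part of this patch): alloc_rpcmem_internal() above
 * follows a classic over-allocate-and-align pattern: request bytes + alignment,
 * round the returned pointer up, and remember the original pointer so it can be
 * handed back to the allocator on free. The same pattern with plain malloc/free
 * (the rpcmem allocator is replaced by malloc purely for illustration;
 * alignment is assumed to be a power of two, as in the 4-byte case used above):
 */
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <unordered_map>

static std::unordered_map<void *, void *> demo_aligned_to_raw;    // aligned -> original

static void * demo_alloc_aligned(size_t bytes, size_t alignment) {
    void * raw = malloc(bytes + alignment);                        // over-allocate
    if (raw == nullptr) {
        return nullptr;
    }
    uintptr_t p       = (uintptr_t)raw;
    uintptr_t aligned = (p + alignment - 1) & ~(uintptr_t)(alignment - 1);
    void * out = (void *)aligned;
    demo_aligned_to_raw[out] = raw;                                // keep original for free()
    return out;
}

static void demo_free_aligned(void * aligned) {
    auto it = demo_aligned_to_raw.find(aligned);
    if (it != demo_aligned_to_raw.end()) {
        free(it->second);
        demo_aligned_to_raw.erase(it);
    }
}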
%d\n", mem_fd); Qnn_MemDescriptor_t descriptor = { @@ -1908,9 +1946,8 @@ int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), - strerror(error)); - return 6; + GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); + return 5; } else { GGMLQNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); } @@ -1949,8 +1986,7 @@ Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t ran {{mem_fd}} }; Qnn_MemHandle_t handle = nullptr; - auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, - /*numDescriptors=*/1, &handle); + auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); return nullptr; @@ -1987,8 +2023,7 @@ void qnn_instance::unregister_rpcmem() { Qnn_MemHandle_t mem_handle = it->second; error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", - QNN_GET_ERROR_CODE(error)); + GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } else { GGMLQNN_LOG_DEBUG("unregister shared memory ok"); } @@ -2020,15 +2055,22 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); +#if defined(__ANDROID__) || defined(__linux__) void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); +#elif defined(_WIN32) || defined(_MSC_VER) + //TODO: Snapdragon based WoA(Windows on ARM) + void * lib_handle = nullptr; +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif if (nullptr == lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; } - auto get_providers = - load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>(lib_handle, - "QnnInterface_getProviders"); + auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + lib_handle, + "QnnInterface_getProviders"); if (nullptr == get_providers) { GGMLQNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); return 2; @@ -2087,7 +2129,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * } _loaded_lib_handle[backend_id] = lib_handle; _backend_id = backend_id; - + auto saver_initialize = load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( _loaded_lib_handle[backend_id], "QnnSaver_initialize"); @@ -2106,7 +2148,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * int qnn_instance::unload_backend() { int dlclose_error = 0; - for (auto &it : _loaded_lib_handle) { + for (auto & it : _loaded_lib_handle) { dlclose_error = dlclose(it.second); if (dlclose_error != 0) { GGMLQNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); @@ -2126,13 +2168,27 @@ int qnn_instance::load_system() { std::string system_lib_path = _lib_path + "libQnnSystem.so"; GGMLQNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); +#if defined(__ANDROID__) || defined(__linux__) _system_lib_handle = 
dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); +#elif defined(_WIN32) || defined(_MSC_VER) + //TODO: Snapdragon based WoA(Windows on ARM) + _system_lib_handle = nullptr; +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif if (nullptr == _system_lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); //re-try with default path of QNN binary runtime lib _lib_path = "/data/local/tmp/"; system_lib_path = _lib_path + "libQnnSystem.so"; +#if defined(__ANDROID__) || defined(__linux__) _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); +#elif defined(_WIN32) || defined(_MSC_VER) + //TODO: Snapdragon based WoA(Windows on ARM) + _system_lib_handle = nullptr; +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif if (nullptr == _system_lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); return 1; @@ -2348,7 +2404,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; + return 6; } else { GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); } @@ -2364,10 +2420,17 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } +#if defined(__ANDROID__) || defined(__linux__) _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); +#elif defined(_WIN32) || defined(_MSC_VER) + //TODO: Snapdragon based WoA(Windows on ARM) + _rpc_lib_handle = nullptr; +#else +#error "ggml-qnn only support WoA, Android, Linux" +#endif if (nullptr == _rpc_lib_handle) { GGMLQNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 9; + return 8; } else { GGMLQNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); @@ -2381,7 +2444,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { || nullptr == _pfn_rpc_mem_to_fd) { GGMLQNN_LOG_WARN("unable to access symbols in QNN RPC lib. 
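/*
 * Illustrative sketch (not part of this patch): taken together, the rpcmem
 * plumbing above follows one lifecycle: allocate shared memory through the
 * function pointers resolved from libcdsprpc.so, turn the buffer into a file
 * descriptor, register that fd with QNN, and tear everything down in reverse
 * order. Condensed to its shape (error handling and the descriptor details
 * shown in register_rpcmem() above are omitted):
 *
 *   void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, size);
 *   int    fd  = _pfn_rpc_mem_to_fd(buf);
 *
 *   Qnn_MemDescriptor_t desc = { ... , {{fd}} };   // dims/data type filled from the tensor
 *   Qnn_MemHandle_t handle = nullptr;
 *   _qnn_interface.qnn_mem_register(_qnn_context_handle, &desc, 1, &handle);
 *
 *   // ... graphs read/write the buffer through `handle` while executing ...
 *
 *   _qnn_interface.qnn_mem_de_register(&handle, 1);
 *   _pfn_rpc_mem_free(buf);
 */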
dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); - return 10; + return 9; } if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy @@ -2393,7 +2456,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { &_qnn_context_handle); if (nullptr == _qnn_context_handle) { GGMLQNN_LOG_WARN("why failed to initialize qnn context, error:%s\n", strerror(errno)); - return 8; + return 10; } else { GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n"); } @@ -2578,7 +2641,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi } int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { + const QnnGraph_Config_t ** graph_configs) { int result = 0; if (nullptr == graph_name) { @@ -2685,14 +2748,18 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { return true; } if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE - || tensor->op == GGML_OP_TRANSPOSE || tensor->op == GGML_OP_VIEW - || tensor->op == GGML_OP_PERMUTE) { + || tensor->op == GGML_OP_TRANSPOSE + || tensor->op == GGML_OP_VIEW + || tensor->op == GGML_OP_PERMUTE + ) { return false; } //TODO: support other op - bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT) - || (tensor->op == GGML_OP_MUL)); + bool supported_op = ((tensor->op == GGML_OP_ADD) + || (tensor->op == GGML_OP_MUL_MAT) + || (tensor->op == GGML_OP_MUL) + ); if (!supported_op) { return false; } @@ -2700,14 +2767,14 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; - const int64_t ne00 = tensor->src[0]->ne[0]; - const int64_t ne01 = tensor->src[0]->ne[1]; + const int64_t ne00 = tensor->src[0]->ne[0]; + const int64_t ne01 = tensor->src[0]->ne[1]; - const int64_t ne10 = tensor->src[1]->ne[0]; - const int64_t ne11 = tensor->src[1]->ne[1]; + const int64_t ne10 = tensor->src[1]->ne[0]; + const int64_t ne11 = tensor->src[1]->ne[1]; - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; + const int64_t ne0 = tensor->ne[0]; + const int64_t ne1 = tensor->ne[1]; const uint32_t src0_rank = ggml_get_tensor_rank(src0); const uint32_t src1_rank = ggml_get_tensor_rank(src1); @@ -3104,7 +3171,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_Param_t out_0_params[] = { {QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, - .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} + .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} } }; @@ -3154,13 +3221,13 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); //step-6: finalize qnn graph and execute qnn graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors_0, 2, output_tensors_0, 1, - NULL, NULL)); + nullptr, nullptr)); qnn_tensors_t ggml_op_mulmat_tensors; ggml_op_mulmat_tensors.reserve(5); @@ -3318,7 +3385,7 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, /* .cpy_tensor = */ 
ggml_backend_qnn_buffer_cpy_tensor, /* .clear = */ ggml_backend_qnn_buffer_clear, - /* .reset = */ NULL, + /* .reset = */ nullptr, }; static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { @@ -3349,7 +3416,7 @@ static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_typ return 32; } -//FIXME: this value is an experimental value on Xiaomi14 +//FIXME: this value is an experimental value on Snapdragon 8 Gen3 based phone static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); @@ -3429,8 +3496,6 @@ static const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { return "unknown"; } return ctx->name; - - GGML_UNUSED(dev); } static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { @@ -3520,10 +3585,10 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ NULL,// defaults to ggml_nbytes + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes /* .is_host = */ ggml_backend_qnn_buffer_is_host }, - /* .context = */ NULL, + /* .context = */ nullptr, }; return &ggml_backend_buffer_type_qnn; @@ -3561,14 +3626,14 @@ static struct ggml_backend_device_i ggml_backend_qnn_device_interface = { /* .get_props = */ ggml_backend_qnn_device_get_props, /* .init_backend = */ ggml_backend_qnn_device_init_backend, /* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, + /* .get_host_buffer_type = */ nullptr, /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_host_ptr, /* .supports_op = */ ggml_backend_qnn_device_supports_op, /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, - /* .offload_op = */ NULL, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, + /* .offload_op = */ nullptr, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_synchronize = */ nullptr, }; static ggml_backend_i ggml_backend_qnn_interface = { @@ -3616,9 +3681,8 @@ struct ggml_backend_qnn_reg_context { }; static const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { - return "ggml-qnn"; - GGML_UNUSED(reg); + return "ggml-qnn"; } static size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { @@ -3639,10 +3703,15 @@ static ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg static void * ggml_backend_qnn_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { GGML_UNUSED(reg); - if (std::strcmp(name, "ggml_backend_set_n_threads") == 0) { + if (nullptr == name) + return nullptr; + + const char * slot_name = "ggml_backend_set_n_threads"; + //avoid buffer attack rather than strcmp + if (0 == std::memcmp(name, slot_name, strlen(slot_name))) { return (void *)ggml_backend_qnn_set_n_threads; } - return NULL; + return nullptr; } static const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { From 6540f50e7ad4be243c7397314b758cdf7b0dfd30 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 26 Feb 2025 13:38:12 +0800 Subject: [PATCH 087/200] ggml-qnn: offload quantized type mulmat to QNN backend --- ggml/src/ggml-qnn/ggml-qnn.cpp | 172 +++++++++++++++++++++++++-------- 1 file changed, 130 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp 
b/ggml/src/ggml-qnn/ggml-qnn.cpp index aaf9fd694f8b9..3a474f1bffee5 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -75,6 +75,7 @@ #include #include #include +#include #if (defined __ANDROID__) || (defined ANDROID) #include "android/log.h" #endif @@ -815,6 +816,11 @@ struct ggml_backend_qnn_context { QNN_INTERFACE_VER_TYPE raw_interface; QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; struct qcom_socinfo socinfo; + + std::unique_ptr work_data; + std::vector> tasks; + size_t work_size = 0; + int n_threads = GGML_DEFAULT_N_THREADS; } ; //the following helper funcs are used to ensure every QNN tensor name is unique @@ -2780,7 +2786,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { const uint32_t src1_rank = ggml_get_tensor_rank(src1); if (tensor->op == GGML_OP_ADD) { - //dump_tensors_info(tensor); + //dump_op_info(tensor); if (!ggml_are_same_shape(src0, src1)) { return false; } @@ -2791,6 +2797,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { } if (tensor->op == GGML_OP_MUL_MAT) { + dump_op_info(tensor); if (src0_rank != src1_rank) // make QNN SDK happy return false; if (src0_rank < 2) // QNN's limitation, make QNN SDK happy @@ -2800,17 +2807,18 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy return false; - //TODO: support more data type in func ggml_qnn_mul_mat(...) - //src0: q4_0, q6_k, ... - //src1: f32 - //dst : f32 - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); + if (2 != src0_rank) { //TODO: quantize src0 for 3D & 4D matrix + return (src0->type == GGML_TYPE_F32) + && (src1->type == GGML_TYPE_F32) + && (tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q6_K) + && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); + } } if (tensor->op == GGML_OP_MUL) { - //dump_tensors_info(tensor); + //dump_op_info(tensor); if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix return false; return (src0->type == GGML_TYPE_F32) @@ -2870,7 +2878,9 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) { p_tensor1 = ggml_qnn_create_compute_tensor(src1); p_tensor2 = ggml_qnn_create_compute_tensor(dst); } +#if GGMLQNN_PRINT_OP_ADD_LOG print_tensors_info(__func__, ctx, src0, src1, dst); +#endif //ensure QNN tensor has correct tensor type QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -2966,7 +2976,6 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) { auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); instance->_qnn_graph_map[graph_name] = graph_item; - } else { Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; @@ -3039,22 +3048,31 @@ static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) { QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; + +#if GGMLQNN_PRINT_OP_ADD_LOG op_perf.info(); +#endif } /* - * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated - * than 
ggml_qnn_general_node. - * matrix transpose and type trait are required for offload mulmat to QNN backend, - * so it's a standalone function. accordingly, this is another typical skeleton for offload other - * ggml ops to QNN backend + * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs + * using the QNN backend. this function performs matrix multiplication of the input tensor + * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, + * and stores the result in the destination tensor `dst`. * - * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, should focus on MUL_MAT. + * @param backend the context which got through (ggml_backend_qnn_context *)backend->context for the + * QNN backend operations. + * @param op the destination tensor where the result of the matrix multiplication will be stored. * - * have three kinds of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend - * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1 + * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated + * than ggml_qnn_general_node. so it's a standalone function. accordingly, this is another + * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute + * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds + * of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend + * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) + * and src1 is F32, src0 -> f32 in src0', then src0' * src1 */ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -3077,10 +3095,72 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; op_perf.start(); - uint32_t src0_rank = ggml_get_tensor_rank(src0); - uint32_t src1_rank = ggml_get_tensor_rank(src1); + const enum ggml_type type = src0->type; + const uint32_t src0_rank = ggml_get_tensor_rank(src0); + const uint32_t src1_rank = ggml_get_tensor_rank(src1); + + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(nb00 == ggml_type_size(type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + GGML_ASSERT(src0_rank == src1_rank); - GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation + GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy + + // broadcast factors + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + const int64_t ne_plane = ne01 * ne00; + const size_t desired_size = ((GGML_TYPE_F32 == type) ? 
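/*
 * Illustrative sketch (not part of this patch): the conversion below is the
 * multi-threaded version of the simple idea behind mul_mat_q_f32 described
 * above: when src0 is quantized (Q4_0, Q6_K, ...), dequantize it plane by
 * plane into the float work buffer so QNN only ever sees F32 data.
 * Single-threaded, the core loop is just:
 *
 *   const auto * traits = ggml_get_type_traits(src0->type);  // per-type trait table
 *   for (int64_t i03 = 0; i03 < ne03; i03++) {
 *       for (int64_t i02 = 0; i02 < ne02; i02++) {
 *           const char * x      = (const char *)src0->data + i02 * nb02 + i03 * nb03;
 *           float      * wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane;
 *           for (int64_t i01 = 0; i01 < ne01; i01++) {
 *               traits->to_float(x + i01 * nb01, wplane + i01 * ne00, ne00);  // one row
 *           }
 *       }
 *   }
 */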
0 : ne03 * ne02 * ne_plane * sizeof(float)); + if (ctx->work_size < desired_size) { + ctx->work_data.reset(new char[desired_size]); + ctx->work_size = desired_size; + } + void * wdata = ctx->work_data.get(); + // convert src0 to float + if (type != GGML_TYPE_F32) { + const auto * type_traits = ggml_get_type_traits(type); + ggml_to_float_t const to_float = type_traits->to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); + const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } + + // wait for all tasks to finish + for (auto & task : ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); + } std::string graph_name; get_graph_key_from_op(op, graph_name); @@ -3133,9 +3213,10 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { 2. QNN's MatMul can only support input tensors with rank >= 2 - there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend. + in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose + operation when offloading mulmat to QNN backend. 
this concise implementation will handle + transpose in func ggml_qnn_create_general_tensor() */ - //step-1: create qnn graph error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); @@ -3158,8 +3239,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + if (type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + } QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; @@ -3170,14 +3254,14 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { //step-5: compose qnn graph: add mat_mul node Qnn_Param_t out_0_params[] = { {QNN_PARAMTYPE_SCALAR, - QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, - .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} + QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, + .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} } }; Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; -#if 0 +#if 0 //leave here for easily understand code, can be removed in the future Qnn_OpConfig_t out_0 = { QNN_OPCONFIG_VERSION_1, .v1 = {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, @@ -3202,7 +3286,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { }; Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; -#if 0 +#if 0 //leave here for easily understand code, can be removed in the future Qnn_OpConfig_t out_trans1_0 = { QNN_OPCONFIG_VERSION_1, .v1 = {"ggmlqnn_mulmat_transpose_opconfig", @@ -3216,7 +3300,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { }; #else Qnn_OpConfig_t out_trans1_0 = create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, - out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); + out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); #endif CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); @@ -3225,9 +3309,9 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - input_tensors_0, 2, - output_tensors_0, 1, - nullptr, nullptr)); + input_tensors_0, 2, + output_tensors_0, 1, + nullptr, nullptr)); qnn_tensors_t ggml_op_mulmat_tensors; ggml_op_mulmat_tensors.reserve(5); @@ -3239,7 +3323,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); instance->_qnn_graph_map[graph_name] = graph_item; } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + if (type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + 
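/*
 * Illustrative sketch (not part of this patch): one way to read the graph
 * composed above. For 2-D tensors ggml's mul_mat produces
 * dst[i1][i0] = sum_k src0[i0][k] * src1[i1][k], i.e. both operands are read
 * along their rows (consistent with the ne0 == ne01 / ne1 == ne11 asserts
 * earlier). With TRANSPOSE_IN1 set, the MatMul node effectively computes
 * src0 * src1^T, which is dst^T, and the follow-up Transpose node flips it
 * back into dst's layout. A plain reference version of the target result:
 */
#include <cstdint>

// src0: ne01 rows of ne00 elements, src1: ne11 rows of ne00 elements,
// dst:  ne11 rows of ne01 elements
static void demo_ref_mul_mat(const float * src0, const float * src1, float * dst,
                             int64_t ne00, int64_t ne01, int64_t ne11) {
    for (int64_t i1 = 0; i1 < ne11; i1++) {
        for (int64_t i0 = 0; i0 < ne01; i0++) {
            float sum = 0.0f;
            for (int64_t k = 0; k < ne00; k++) {
                sum += src0[i0 * ne00 + k] * src1[i1 * ne00 + k];
            }
            dst[i1 * ne01 + i0] = sum;
        }
    }
}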
} else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; + } QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; @@ -3250,13 +3338,13 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { Qnn_Tensor_t tensor_outputs[] = { *p_tensor2 }; - // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through - // QNN SDK, details could be found at - // https://github.com/kantv-ai/llama.cpp/wiki/mapping-ggml-compute-graph-to-QNN-compute-graph + // this is the second technical approach or another pipeline of "how to utilize the Hexagon + // NPU maximally" through QNN SDK, details could be found at + // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); } // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor From 612b572744e012a9b697e4ad6a621b8f333e8b31 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 27 Feb 2025 14:34:10 +0800 Subject: [PATCH 088/200] ggml-qnn: refine source code structure to make code more clearly --- ggml/src/ggml-qnn/ggml-qnn-impl.h | 138 +++ ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 140 +++ ggml/src/ggml-qnn/ggml-qnn-ops.h | 5 + ggml/src/ggml-qnn/ggml-qnn.cpp | 1662 ++++++---------------------- scripts/build-run-android.sh | 282 ----- 5 files changed, 630 insertions(+), 1597 deletions(-) delete mode 100755 scripts/build-run-android.sh diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h index 5a2fe5752a097..974755955f9d2 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -64,8 +64,12 @@ #include "android/log.h" #endif +<<<<<<< HEAD #if defined(_WIN32) #include +======= +#if defined(_WIN32) || defined(_MSC_VER) +>>>>>>> ggml-qnn: refine source code structure to make code more clearly #include #endif @@ -90,7 +94,11 @@ class qnn_instance; struct ggml_backend_qnn_context; void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); +<<<<<<< HEAD #if 0//def NDEBUG +======= +#ifdef NDEBUG +>>>>>>> ggml-qnn: refine source code structure to make code more clearly #define GGMLQNN_DEBUG 0 #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log @@ -105,9 +113,15 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char #endif #define GGML_QNN_LOGBUF_LEN 4096 +<<<<<<< HEAD #define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +======= +#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_INFO(...) 
ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +>>>>>>> ggml-qnn: refine source code structure to make code more clearly #if GGMLQNN_DEBUG #define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -141,6 +155,7 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char #define GQCGT ggmlqnn_create_general_tensor +<<<<<<< HEAD #if defined(_WIN32) #define RTLD_GLOBAL 0x100 #define RTLD_LOCAL 0x000 @@ -152,6 +167,8 @@ void * dlsym(void* handle, const char* name); const char * dlerror(void); #endif +======= +>>>>>>> ggml-qnn: refine source code structure to make code more clearly using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); @@ -226,7 +243,11 @@ struct qnn_op_caps_t { const size_t input_param_count = 0; const char * qnn_param_name = nullptr; }; +<<<<<<< HEAD extern const qnn_op_caps_t ggmlqnn_k_op_caps[]; +======= +extern const qnn_op_caps_t k_op_caps[]; +>>>>>>> ggml-qnn: refine source code structure to make code more clearly #if ENABLE_QNNBACKEND_PERF class qnn_perf { @@ -255,9 +276,13 @@ class qnn_perf { #else class qnn_perf { public: +<<<<<<< HEAD qnn_perf(const std::string & perf_name) { GGML_UNUSED(perf_name); } +======= + qnn_perf(const std::string & perf_name) {} +>>>>>>> ggml-qnn: refine source code structure to make code more clearly qnn_perf() = delete; qnn_perf(const qnn_perf & ) = delete; qnn_perf & operator= (const qnn_perf & ) = delete; @@ -289,6 +314,7 @@ class qnn_interface { qnn_interface() = default; // QnnBackend +<<<<<<< HEAD DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree) @@ -369,6 +395,88 @@ class qnn_interface { DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo) DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree) +======= + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // 
QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); +>>>>>>> ggml-qnn: refine source code structure to make code more clearly void set_qnn_interface(const QnnInterface_t * qnn_interface) { _qnn_interface = qnn_interface; @@ -387,9 +495,15 @@ class qnn_interface { } private: +<<<<<<< HEAD const QnnInterface_t * _qnn_interface = nullptr; const QnnSystemInterface_t * _qnn_sys_interface = nullptr; +======= + const QnnInterface_t *_qnn_interface = nullptr; + + const QnnSystemInterface_t *_qnn_sys_interface = nullptr; +>>>>>>> ggml-qnn: refine source code structure to make code more clearly }; class qnn_instance { @@ -400,7 +514,11 @@ class qnn_instance { const std::string & model_name) : _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), +<<<<<<< HEAD _model_name(std::move(model_name)) {} +======= + _model_name(std::move(model_name)) {}; +>>>>>>> ggml-qnn: refine source code structure to make code more clearly ~qnn_instance() { } @@ -430,6 +548,7 @@ class qnn_instance { return _qnn_raw_system_interface; } +<<<<<<< HEAD Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } @@ -443,6 +562,21 @@ class qnn_instance { QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } +======= + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } +>>>>>>> ggml-qnn: refine source code structure to make code more clearly int init_qnn_graph(const char * graph_name, bool debug, @@ -596,6 +730,10 @@ const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code); Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype); void * 
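/*
 * Illustrative sketch (not part of this patch): DEFINE_SHIM_FUNCTION_INTERFACE
 * itself is outside this hunk; judging from its uses above it generates a thin
 * qnn_<name>() wrapper that forwards to one function pointer of the loaded QNN
 * interface. A hypothetical macro in that spirit (the _qnn_fn_table member is
 * an assumption standing in for however the versioned function table is
 * reached, not the backend's actual definition):
 *
 *   #define DEMO_SHIM(F, pointer_name)                              \
 *       template <typename... Args>                                 \
 *       inline auto qnn_##F(Args... args) const {                   \
 *           return _qnn_fn_table.pointer_name(args...);             \
 *       }
 *
 *   // DEMO_SHIM(backend_create, backendCreate) then yields a
 *   // qnn_backend_create(...) method forwarding to backendCreate(...).
 */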
ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output); +<<<<<<< HEAD +======= +bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +>>>>>>> ggml-qnn: refine source code structure to make code more clearly uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata); void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 00cb7da32c183..6614a1b90f6fd 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -23,6 +23,7 @@ #include "ggml-common.h" #include "ggml-qnn-ops.h" +<<<<<<< HEAD static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { /* size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); @@ -52,6 +53,8 @@ static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const return true; } +======= +>>>>>>> ggml-qnn: refine source code structure to make code more clearly #define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ @@ -82,7 +85,11 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; size_t qnn_op_index = ggmlqnn_get_op_index(op); GGML_ASSERT(qnn_op_index < ggmlqnn_get_opcaps_size()); +<<<<<<< HEAD const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; +======= + const char * qnn_op_name = k_op_caps[qnn_op_index].qnn_op_name; +>>>>>>> ggml-qnn: refine source code structure to make code more clearly std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); const char * ggml_op_name = ggml_op_name_string.c_str(); @@ -104,7 +111,13 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { p_tensor1 = ggmlqnn_create_compute_tensor(src1); p_tensor2 = ggmlqnn_create_compute_tensor(dst); } +<<<<<<< HEAD //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); +======= +#if GGMLQNN_PRINT_OP_ADD_LOG + print_tensors_info(__func__, ctx, src0, src1, dst); +#endif +>>>>>>> ggml-qnn: refine source code structure to make code more clearly //ensure QNN tensor has correct tensor type QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; @@ -152,9 +165,15 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { return; } } else { +<<<<<<< HEAD QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; +======= + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; +>>>>>>> ggml-qnn: refine source code structure to make code more clearly } Qnn_Tensor_t tensor_inputs[] = { @@ -242,9 +261,15 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); } } else 
{ +<<<<<<< HEAD QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; +======= + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; +>>>>>>> ggml-qnn: refine source code structure to make code more clearly } Qnn_Tensor_t tensor_inputs[] = { @@ -279,6 +304,7 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { } /* +<<<<<<< HEAD * this function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend * UT in ggml-qnn-ut.cpp passed: * ./scripts/build-run-android.sh run_ut_mulmat 0 @@ -474,6 +500,8 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) } /* +======= +>>>>>>> ggml-qnn: refine source code structure to make code more clearly * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs * using the QNN backend. this function performs matrix multiplication of the input tensor * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, @@ -518,9 +546,13 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { const uint32_t src1_rank = ggml_n_dims(src1); GGML_ASSERT(src0_rank == src1_rank); GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy +<<<<<<< HEAD if (4 == src0_rank) { return ggml_qnn_mul_mat_4d(ctx, op); } +======= + GGML_ASSERT(src0_rank != 4); //TODO: 4D matrix mulmat +>>>>>>> ggml-qnn: refine source code structure to make code more clearly void * wdata = ggmlqnn_type_trait(ctx, op); const size_t desired_size = ctx->desired_size; @@ -604,10 +636,17 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { if (src0_type != GGML_TYPE_F32) { QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; } else { +<<<<<<< HEAD QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; } QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; +======= + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; +>>>>>>> ggml-qnn: refine source code structure to make code more clearly //step-4: create a transpose tensor p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); @@ -636,13 +675,21 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { }; #else Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, +<<<<<<< HEAD out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); +======= + out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); +>>>>>>> ggml-qnn: refine source code structure to make code more clearly #endif CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); //step-5: compose qnn graph: add transpose node Qnn_Param_t out_trans1_0_params[] 
= { +<<<<<<< HEAD {QNN_PARAMTYPE_TENSOR, +======= + {(Qnn_ParamType_t) 1, +>>>>>>> ggml-qnn: refine source code structure to make code more clearly "perm", .tensorParam = *p_param_tensor } }; @@ -662,7 +709,11 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { }; #else Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, +<<<<<<< HEAD out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); +======= + out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); +>>>>>>> ggml-qnn: refine source code structure to make code more clearly #endif CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); @@ -688,10 +739,17 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { if (src0_type != GGML_TYPE_F32) { QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; } else { +<<<<<<< HEAD QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; } QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; +======= + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; +>>>>>>> ggml-qnn: refine source code structure to make code more clearly Qnn_Tensor_t tensor_inputs[] = { *p_tensor0, @@ -715,6 +773,7 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; op_perf.info(); } +<<<<<<< HEAD void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); @@ -820,6 +879,78 @@ void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); +======= +void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_add(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +static void ggml_qnn_avg_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +static void 
ggml_qnn_max_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { +} + +void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +>>>>>>> ggml-qnn: refine source code structure to make code more clearly } void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { @@ -827,6 +958,7 @@ void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { } void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +<<<<<<< HEAD GGML_UNUSED(ctx); GGML_UNUSED(dst); } @@ -839,4 +971,12 @@ void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); +======= +} + +void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +>>>>>>> ggml-qnn: refine source code structure to make code more clearly } diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.h b/ggml/src/ggml-qnn/ggml-qnn-ops.h index b1c388a32a87a..c25638a9397c6 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.h +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.h @@ -24,8 +24,13 @@ #include "ggml-qnn-impl.h" void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * dst); void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +<<<<<<< HEAD void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +======= +void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +void ggml_qnn_add(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +>>>>>>> ggml-qnn: refine source code structure to make code more clearly void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst); void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 3a474f1bffee5..ff1a8a0f39506 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -13,7 +13,7 @@ * section-5 does ggml-qnn backend helper macro / data structure / function / class * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem * - * currently provide following ggml ops' QNN backend implementation: + * currently provide following ggml ops' QNN backend implementation in ggml-qnn-ops.cpp: * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly @@ -36,105 +36,19 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. 
*/ -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(__ANDROID__) || defined(__linux__) -#include -#include -#include -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if (defined __ANDROID__) || (defined ANDROID) -#include "android/log.h" -#endif - -#if defined(_WIN32) || defined(_MSC_VER) -#include -#endif - -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnContext.h" -#include "QnnBackend.h" -#include "QnnGraph.h" -#include "QnnProperty.h" -#include "QnnTensor.h" -#include "QnnInterface.h" -#include "Saver/QnnSaver.h" -#include "System/QnnSystemInterface.h" -#include "HTP/QnnHtpDevice.h" -#include "HTP/QnnHtpGraph.h" - -#include "ggml-qnn.h" -#include "ggml-impl.h" -#include "ggml-backend-impl.h" +#include "ggml-qnn-impl.h" +#include "ggml-qnn-ops.h" // ================================================================================================= // section-1: forward/external declaration // ================================================================================================= -class qnn_instance; -struct ggml_backend_qnn_context; static int free_qnn_tensor(Qnn_Tensor_t * tensor); static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); -static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); -static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, - Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, - uint32_t rank, uint32_t * dims, - void * data, uint32_t data_size, - bool b_transpose = false); +typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); // ================================================================================================= // section-2: ggml-qnn internal troubleshooting function // ================================================================================================= -#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend -#define GGML_QNN_LOGBUF_LEN 4096 -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 - -#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if GGMLQNN_DEBUG -#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define GGMLQNN_LOG_DEBUG(...) -#endif -static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { +void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ static std::mutex ggmlqnn_log_internal_mutex; static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; @@ -160,16 +74,6 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const // ================================================================================================= // section-3: general helper macro / data structure / function // ================================================================================================= -#define DISABLE_COPY(class_name) \ - class_name(const class_name &) = delete; \ - void operator=(const class_name &) = delete - -#define DISABLE_MOVE(class_name) \ - class_name(class_name &&) = delete; \ - void operator=(class_name &&) = delete - -#define GQCGT ggml_qnn_create_general_tensor - static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? offset : offset + @@ -249,19 +153,6 @@ static void * ggmlqnn_host_malloc(size_t n) { // ================================================================================================= // section-4: QNN helper macro / data structure / function // ================================================================================================= -#define CHECK_QNN_API(error, result) \ - do { \ - error = (result); \ - if (QNN_SUCCESS != error) { \ - if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ - GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ - } else { \ - GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, qnn_get_error_string(error)); \ - } \ - } \ - } while (0) - -#define QNN_VER_PTR(x) (&((x).v1)) #define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) #define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) @@ -420,9 +311,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; dst.version = src.version; - QNN_TENSOR_SET_NAME( - dst, ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (QNN_TENSOR_GET_NAME(dst) == nullptr) { + QNN_TENSOR_SET_NAME(dst, ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (nullptr == QNN_TENSOR_GET_NAME(dst)) { return 1; } QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); @@ -441,20 +331,20 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { } Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; + Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; size_t scale_offset_size = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); + *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); ggmlqnn_memscpy(*scale_offset, scale_offset_size, src_qparam.axisScaleOffsetEncoding.scaleOffset, scale_offset_size); QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_QuantizeParams_t 
src_qparam_cpy = src_qparam; Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); float ** scales = &bwaxis_scale_offset.scales; @@ -476,7 +366,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { QNN_TENSOR_SET_RANK(dst, rank); size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); uint32_t * dimensions = (uint32_t *)malloc(dim_size); - if (dimensions == nullptr) { + if (nullptr == dimensions) { GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); return 1; } @@ -488,10 +378,8 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { static int free_qnn_tensor(Qnn_Tensor_t * tensor) { int err = 0; - free((void *) QNN_TENSOR_GET_NAME(*tensor)); - - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { free(src_qparam.axisScaleOffsetEncoding.scaleOffset); @@ -507,54 +395,7 @@ static int free_qnn_tensor(Qnn_Tensor_t * tensor) { return err; } -static size_t qnn_datatype_size(Qnn_DataType_t qnn_type) { - switch (qnn_type) { - case QNN_DATATYPE_FLOAT_32: - return sizeof(float); - case QNN_DATATYPE_FLOAT_16: - return sizeof(uint16_t); - case QNN_DATATYPE_UINT_32: - case QNN_DATATYPE_INT_32: - return sizeof(int32_t); - case QNN_DATATYPE_INT_16: - return sizeof(int16_t); - case QNN_DATATYPE_INT_8: - return sizeof(int8_t); - case QNN_DATATYPE_SFIXED_POINT_8: - return sizeof(int8_t); - case QNN_DATATYPE_SFIXED_POINT_4: - return sizeof(int8_t); - default: - break; - } - return 0; -} - -static const char * qnn_datatype_to_string(Qnn_DataType_t qnn_type) { - switch (qnn_type) { - case QNN_DATATYPE_FLOAT_32: - return "QNN_DATATYPE_FLOAT_32"; - case QNN_DATATYPE_FLOAT_16: - return "QNN_DATATYPE_FLOAT_16"; - case QNN_DATATYPE_UINT_32: - return "QNN_DATATYPE_UINT_32"; - case QNN_DATATYPE_INT_32: - return "QNN_DATATYPE_INT_32"; - case QNN_DATATYPE_INT_16: - return "QNN_DATATYPE_INT_16"; - case QNN_DATATYPE_INT_8: - return "QNN_DATATYPE_INT_8"; - case QNN_DATATYPE_SFIXED_POINT_8: - return "QNN_DATATYPE_SFIXED_POINT_8"; - case QNN_DATATYPE_SFIXED_POINT_4: - return "QNN_DATATYPE_SFIXED_POINT_4"; - default: - break; - } - return "QNN_DATATYPE_UNDEFINED"; -} - -static const char * qnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { +const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html switch (qnn_error_code) { case QNN_SUCCESS: @@ -658,7 +499,7 @@ static const char * qnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { } // helper function to create an operation config -static Qnn_OpConfig_t create_op_config(const char * name, const char * package, const char * type, +Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, Qnn_Param_t * params, uint32_t num_params, Qnn_Tensor_t * inputs, uint32_t num_inputs, Qnn_Tensor_t * outputs, uint32_t num_outputs) { @@ -674,61 +515,6 @@ static Qnn_OpConfig_t create_op_config(const char * name, const char * package, // ================================================================================================= // section-5:ggml-qnn backend helper macro / data structure / function / 
class // ================================================================================================= -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - -typedef void (* ggmlqnn_op_func_t)(ggml_backend_t backend, ggml_tensor * op); - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - -using qnn_res_t = std::tuple>; -using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; - -enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; - -enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - V79 = 79, -}; - -enum qcom_chipset_soc_model { - UNKNOWN_SM = 0, - SM7450 = 41, // v69, 7 Gen1 - SM8350 = 30, // v68, 888 - SM8450 = 36, // v69, SD 8 Gen 1 - SM8475 = 42, // v69, SD 8+ Gen 1 - SM8550 = 43, // v73, SD 8 Gen 2 - SM8650 = 57, // v75, SD 8 Gen 3 - SM8750 = 69, // v79, SD 8 Gen 4 -#if defined(_WIN32) || defined(_MSC_VER) - SC7280X = 44, - SC8280X = 37, - SC8380XP = 60, -#endif -}; - -struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; - char soc_desc[GGML_MAX_NAME]; -}; - //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices static struct qcom_socinfo g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 7 Gen 1 */ @@ -780,7 +566,7 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, -#if defined(_WIN32) || defined(_MSC_VER) +#if defined(_MSC_VER) /* Qualcomm SnapDragon 7c Gen 2 */ [SC7280X] = { .soc_model = SC7280X, @@ -805,24 +591,6 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { }; -struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char desc[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - struct qcom_socinfo socinfo; - - std::unique_ptr work_data; - std::vector> tasks; - size_t work_size = 0; - int n_threads = GGML_DEFAULT_N_THREADS; -} ; - //the following helper funcs are used to ensure every QNN tensor name is unique static std::atomic g_ggmltensor_idx(0); static void reset_idx() { @@ -848,7 +616,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-cpu", .desc = "Qualcomm Kryo CPU", -#if defined(_WIN32) || defined(_MSC_VER) +#if defined(_MSC_VER) .lib = "QnnCpu.dll", #else .lib = "libQnnCpu.so", @@ -863,7 +631,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-gpu", .desc = "Qualcomm Adreno GPU", -#if defined(_WIN32) || defined(_MSC_VER) +#if defined(_MSC_VER) .lib = "QnnGpu.dll", #else .lib = "libQnnGpu.so", @@ -878,7 +646,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-npu", .desc = "Qualcomm NPU(Hexagon Tensor Processor)", -#if defined(_WIN32) || defined(_MSC_VER) +#if defined(_MSC_VER) .lib = "QnnHtp.dll", #else .lib = "libQnnHtp.so", @@ -890,13 +658,7 @@ static struct 
ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, }; -struct qnn_op_caps_t { - const char * qnn_op_name = nullptr; - const size_t input_param_count = 0; - const char * qnn_param_name = nullptr; -}; - -static const qnn_op_caps_t k_op_caps[] = { +const qnn_op_caps_t k_op_caps[] = { {}, // GGML_OP_NONE {}, // GGML_OP_DUP { @@ -1056,54 +818,6 @@ static struct qcom_socinfo * qnn_get_socinfo_from_socmodel(uint32_t soc_model) { return nullptr; } -static bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - qnn_instance * instance = ctx->instance; - if (nullptr == instance) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - return true; -} - -#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - -static uint32_t ggml_get_tensor_rank(const ggml_tensor * tensor) { - /* - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; - */ - return ggml_n_dims(tensor); -} - -static uint32_t ggml_get_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = ggml_get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} static const char * ggml_get_type_name(ggml_type type) { const struct ggml_type_traits * traits = ggml_get_type_traits(type); @@ -1115,9 +829,8 @@ static const char * get_ggml_type_name(ggml_type type) { return traits->type_name; } -//TODO: // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { +Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { switch (ggmltype) { case GGML_TYPE_F16: return QNN_DATATYPE_FLOAT_16; @@ -1135,7 +848,6 @@ static Qnn_DataType_t qnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { return QNN_DATATYPE_UNDEFINED; } -//TODO: static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { switch (qnn_type) { case QNN_DATATYPE_FLOAT_32: @@ -1190,7 +902,7 @@ static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, c } } -static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, const char * name, +Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, @@ -1227,7 +939,7 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, if (b_transpose) { GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case - get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_get_tensor_rank(tensor)); + get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); tensor_dims = transpose_dims; #if 0 for (size_t idx = 0; idx < 4; idx++) { @@ -1277,7 +989,7 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor, return p_qnn_tensor; } -static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const 
ggml_tensor * tensor) { +Qnn_Tensor_t * ggmlqnn_create_compute_tensor(const ggml_tensor * tensor) { uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; @@ -1289,8 +1001,8 @@ static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } - qnn_data_type = qnn_datatype_from_ggml_datatype(tensor->type); - Qnn_Tensor_t * p_qnn_tensor = ggml_qnn_create_general_tensor(tensor, nullptr, + qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); + Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(tensor, nullptr, qnn_tensor_type, qnn_data_type, ggml_n_dims(tensor), dimensions, nullptr, 0); @@ -1298,6 +1010,77 @@ static Qnn_Tensor_t * ggml_qnn_create_compute_tensor(const ggml_tensor * tensor) return p_qnn_tensor; } +void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + const enum ggml_type src0_type = src0->type; + + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(nb00 == ggml_type_size(src0_type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // broadcast factors + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + const int64_t ne_plane = ne01 * ne00; + const size_t desired_size = ((GGML_TYPE_F32 == src0_type) ? 0 : ne03 * ne02 * ne_plane * sizeof(float)); + ctx->desired_size = desired_size; + if (ctx->work_size < desired_size) { + ctx->work_data.reset(new char[desired_size]); + ctx->work_size = desired_size; + } + ctx->n_threads = std::thread::hardware_concurrency(); + void * wdata = ctx->work_data.get(); + // convert src0 to float + if (src0_type != GGML_TYPE_F32) { + const auto *type_traits = ggml_get_type_traits(src0_type); + ggml_to_float_t const to_float = type_traits->to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void *x = (char *) src0->data + i02 * nb02 + i03 * nb03; + float *const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1); + const int n_threads = std::max( + std::min(ctx->n_threads, (int) (ne01 / min_rows_per_thread)), 1); + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } + + // wait for all tasks to finish + for (auto &task: ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); + } + return wdata; +} + static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { char buffer[256] = {}; const char * type_name = get_ggml_type_name(tensor->type); @@ -1323,7 +1106,11 @@ static void append_tensor_dimensions(const ggml_tensor * 
tensor, std::string & o output.append(buffer, len); } -static size_t get_qnn_op_index(const ggml_tensor * tensor) { +size_t ggmlqnn_get_opcaps_size() { + return std::size(k_op_caps); +} + +size_t ggmlqnn_get_op_index(const ggml_tensor * tensor) { if (tensor->op == GGML_OP_UNARY) { return GGML_OP_COUNT + ggml_get_unary_op(tensor); } @@ -1331,17 +1118,17 @@ static size_t get_qnn_op_index(const ggml_tensor * tensor) { return tensor->op; } -static size_t get_qnn_op_input_param_count(const ggml_tensor * op) { - auto op_index = get_qnn_op_index(op); +static size_t ggmlqnn_get_op_input_param_count(const ggml_tensor * op) { + auto op_index = ggmlqnn_get_op_index(op); GGML_ASSERT(op_index < std::size(k_op_caps)); return k_op_caps[op_index].input_param_count; } -static void get_graph_key_from_op(const ggml_tensor * op, std::string & output) { +void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output) { GGML_ASSERT(op->op != GGML_OP_NONE); output += ggml_op_desc(op); output += get_ggml_type_name(op->type); - size_t param_count = get_qnn_op_input_param_count(op); + size_t param_count = ggmlqnn_get_op_input_param_count(op); for (size_t i = 0; i < param_count; ++i) { auto * input = op->src[i]; if (!input) { @@ -1352,42 +1139,21 @@ static void get_graph_key_from_op(const ggml_tensor * op, std::string & output) } } -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() { - _begin_time = ggml_time_us(); +bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; } - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; } -private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; -}; -#else -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) {} - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() {} - void info() {} -}; -#endif + return true; +} template Fn load_qnn_functionpointers(void * handle, const char * function_name) { @@ -1401,417 +1167,6 @@ Fn load_qnn_functionpointers(void * handle, const char * function_name) { #endif } -class qnn_interface { - -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... 
args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - friend class qnn_instance; - -public: - qnn_interface() = default; - - // QnnBackend - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); - - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); - - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); - - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); - - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); - - // QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); - - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); - - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); - - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); - - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); - - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); - - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); - - void set_qnn_interface(const QnnInterface_t * qnn_interface) { - _qnn_interface = qnn_interface; - } - - void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } - - uint32_t get_backend_id() const { - return _qnn_interface->backendId; - } - - bool is_loaded() const { - return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); - } - -private: - const QnnInterface_t *_qnn_interface = nullptr; - - const QnnSystemInterface_t *_qnn_sys_interface 
= nullptr; -}; - -class qnn_instance { -public: - using BackendIdType = decltype(QnnInterface_t{}.backendId); - - explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, - const std::string & model_name) : - _lib_path(std::move(lib_path)), - _backend_name(std::move(backend_name)), - _model_name(std::move(model_name)) {}; - - ~qnn_instance() { - } - - int qnn_init(const QnnSaver_Config_t ** saver_config); - - int qnn_finalize(); - - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_interface; - } - - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } - - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_system_interface; - } - - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - - const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - - const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - - const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - - const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - - const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - - int init_qnn_graph(const char * graph_name, - bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr - ); - int init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); - - int finalize_qnn_graph(); - - bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } - - int init_htp_perfinfra() { - QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to get qnn device infra\n"); - return 1; - } - - QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; - uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; - htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - _qnn_htp_perfinfra = htp_perfinfra; - _qnn_power_configid = power_configid; - - return 0; - } - - int set_rpc_polling() { - if (_qnn_rpc_pollingtime > 0) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; - memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); - rpc_pollingtime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; - if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); - } - } - return 0; - } - - int set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { - GGMLQNN_LOG_DEBUG("perf intra is null\n"); - return 1; - } - - QnnHtpPerfInfrastructure_PowerConfig_t power_config; - memset(&power_config, 0, sizeof(power_config)); - power_config.option = 
QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.contextId = _qnn_power_configid; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False - power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False - power_config.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False - power_config.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable - power_config.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False - // set Sleep latency parameter - uint32_t latencyValue = 40; - power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec - // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr}; - - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); - - return 0; - } - - std::string & get_qnn_graph_name() { return _graph_name; } - - bool is_rpcmem_initialized() { - return _rpcmem_initialized; - } - - void set_rpcmem_initialized(bool initialized) { - _rpcmem_initialized = initialized; - } - - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - size_t get_rpcmem_usage() { return _rpcmem_usage; } - - int32_t rpcmem_to_fd(void * buf); - - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); - Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); - - void unregister_rpcmem(); - void unregister_rpcmem(Qnn_MemHandle_t mem_handle); - - void * alloc_rpcmem(size_t bytes, size_t alignment); - void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); - - void free_rpcmem(void * buf); - void free_rpcmem(); - - bool is_rpcmem_allocated(void * buf); - - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { - return _qnn_mem_set.count(handle) != 0U; - } - - bool enable_qnn_rpc() { - return _enable_qnn_rpc; - } - - void probe_device_meminfo() { - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); - if (nullptr == rpc_buffer) { - GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - 
free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } - } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; - - free_rpcmem(); - _rpcmem_usage = 0; - GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); - } - -public: - std::map>> _qnn_graph_map; - -private: - int load_system(); - - int unload_system(); - - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); - - int unload_backend(); - - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; - } - - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - - void * alloc_rpcmem_internal(size_t bytes, size_t alignment); - -private: - static constexpr const int _required_num_providers = 1; - -private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // name of prebuilt QNN model, might be used in the future - BackendIdType _backend_id; - - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; - - void * _system_lib_handle = nullptr; - - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing - - qnn_interface _qnn_interface; - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - - std::unordered_map _qnn_mem_set; - std::unordered_map _qnn_rpc_buffer_to_handles; - - static std::mutex _init_mutex; - static std::unordered_map _loaded_lib_handle; - static std::unordered_map _lib_path_to_backend_id; - static std::unordered_map _loaded_backend; - - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - std::unordered_map _rpcmem_usage_map; - size_t _rpcmem_usage = 0; // mempool usage in Mbytes - size_t _rpcmem_capacity = 512; // mempool size in Mbytes - - std::string _graph_name; - QNNBackend _device_id; - void * _rpc_lib_handle = nullptr; - bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature - - DISABLE_COPY(qnn_instance); - DISABLE_MOVE(qnn_instance); -}; - std::mutex qnn_instance::_init_mutex; std::unordered_map qnn_instance::_loaded_lib_handle; std::unordered_map qnn_instance::_lib_path_to_backend_id; @@ -2637,7 +1992,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi if (error != QNN_SUCCESS) { GGMLQNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", ggml_backend_qnn_get_devname(device), graph_name.c_str(), - qnn_get_error_string(error)); + ggmlqnn_get_error_string(error)); return error; } @@ -2697,7 
+2052,103 @@ int qnn_instance::finalize_qnn_graph() { return 0; } -static uint8_t * create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { +int qnn_instance::init_htp_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (error != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } + + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_power_configid = power_configid; + + return 0; +} + +int qnn_instance::set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; + memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); + rpc_pollingtime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + } + } + return 0; +} + +int qnn_instance::set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + GGMLQNN_LOG_DEBUG("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + power_config.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + power_config.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + power_config.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = 
{&power_config, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + + return 0; +} + +void qnn_instance::probe_device_meminfo() { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; + + free_rpcmem(); + _rpcmem_usage = 0; + GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); +} + +uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { GGMLQNN_LOG_WARN("invalid params\n"); return nullptr; @@ -2716,7 +2167,7 @@ static uint8_t * create_rpc_buffer(qnn_instance * instance, const ggml_tensor * return qnn_rpcbuffer; } -static void print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { //skip sanity check of params if (nullptr != func_name && nullptr != ctx) { GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); @@ -2742,14 +2193,14 @@ static void dump_op_info(const struct ggml_tensor * tensor) { struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * dst = const_cast(tensor); GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); - print_tensors_info(nullptr, nullptr, src0, src1, dst); + ggmlqnn_print_tensors_info(nullptr, nullptr, src0, src1, dst); } // ================================================================================================= // section-6: implementation of ggml-qnn backend // ================================================================================================= //TODO: refine this function as it is a performance hotspot/bottleneck function -static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { +static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor) { if (tensor->op == GGML_OP_NONE) { return true; } @@ -2761,7 +2212,7 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { return false; } - //TODO: support other op + //TODO: add other op here bool supported_op = ((tensor->op == GGML_OP_ADD) || (tensor->op == GGML_OP_MUL_MAT) || (tensor->op == GGML_OP_MUL) @@ -2782,8 +2233,8 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { const int64_t ne0 = tensor->ne[0]; const int64_t ne1 = tensor->ne[1]; - const uint32_t src0_rank = ggml_get_tensor_rank(src0); - const uint32_t src1_rank = ggml_get_tensor_rank(src1); + const uint32_t src0_rank = ggml_n_dims(src0); + const uint32_t src1_rank = ggml_n_dims(src1); if (tensor->op == GGML_OP_ADD) { //dump_op_info(tensor); @@ -2802,19 +2253,22 @@ static bool 
ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { return false; if (src0_rank < 2) // QNN's limitation, make QNN SDK happy return false; - if (src0_rank > 3) //TODO: 4D matrix + if (4 == src0_rank) //TODO: 4D matrix mulmat return false; if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy return false; - if (2 != src0_rank) { //TODO: quantize src0 for 3D & 4D matrix - return (src0->type == GGML_TYPE_F32) - && (src1->type == GGML_TYPE_F32) - && (tensor->type == GGML_TYPE_F32); - } else { - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q6_K) + if (ctx->device == QNN_BACKEND_NPU) + if (2 == src0_rank) + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K + ) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); + else + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); + else + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); - } } if (tensor->op == GGML_OP_MUL) { @@ -2826,556 +2280,135 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) { && (tensor->type == src1->type); } - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16) - && (src0->type == src1->type) && (src0->type == tensor->type); -} - -/* - * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input - * tensor and 1 output tensor -*/ -static void ggml_qnn_general_node(ggml_backend_t backend, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - enum ggml_status result = GGML_STATUS_SUCCESS; - bool graph_initialized = false; - qnn_instance * instance = nullptr; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - size_t qnn_op_index = get_qnn_op_index(op); - GGML_ASSERT(qnn_op_index < std::size(k_op_caps)); - const char * qnn_op_name = k_op_caps[qnn_op_index].qnn_op_name; - std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); - const char * ggml_op_name = ggml_op_name_string.c_str(); - - qnn_perf op_perf = qnn_perf(ggml_op_name); - op_perf.start(); - - std::string graph_name; - get_graph_key_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensor = std::get<1>(graph_item); - p_tensor0 = tensor[0]; - p_tensor1 = tensor[1]; - p_tensor2 = tensor[2]; - } else { - p_tensor0 = ggml_qnn_create_compute_tensor(src0); - p_tensor1 = ggml_qnn_create_compute_tensor(src1); - p_tensor2 = ggml_qnn_create_compute_tensor(dst); - } -#if 
GGMLQNN_PRINT_OP_ADD_LOG - print_tensors_info(__func__, ctx, src0, src1, dst); -#endif - - //ensure QNN tensor has correct tensor type - QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - - //save the original dimensions of qnn tensors - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; - - bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; - - if (!graph_initialized) { - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); - return; - } - graph_handle = instance->get_qnn_graph_handle(); - - if (enable_npu_rpc) { - QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor0)->clientBuf = {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*p_tensor1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor1)->clientBuf = {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*p_tensor2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor2)->clientBuf = {.data=nullptr, .dataSize=0}; - } - - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - - if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer_0 = create_rpc_buffer(instance, src0, p_tensor0, true); - uint8_t * qnn_rpcbuffer_1 = create_rpc_buffer(instance, src1, p_tensor1, true); - uint8_t * qnn_rpcbuffer_2 = create_rpc_buffer(instance, dst, p_tensor2, false); - if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { - GGMLQNN_LOG_INFO("create rpc buffer failure\n"); - //TODO: potential memory leak although it shouldn't happen - return; - } - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - } - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - Qnn_OpConfig_t op_config = { - QNN_OPCONFIG_VERSION_1, .v1 = { - ggml_op_name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - - if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); - if (nullptr != qnn_rpcbuffer) { - memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); - } - } - - qnn_tensors_t ggml_op_add_tensors; - ggml_op_add_tensors.reserve(3); - 
ggml_op_add_tensors.push_back(p_tensor0); - ggml_op_add_tensors.push_back(p_tensor1); - ggml_op_add_tensors.push_back(p_tensor2); - - auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; - } else { - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - src0_qnn_type = qnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = qnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = qnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*p_tensor0)->rank = ggml_get_tensor_rank(src0); - QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; - - QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*p_tensor1)->rank = ggml_get_tensor_rank(src1); - QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; - - QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; - QNN_VER_PTR(*p_tensor2)->rank = ggml_get_tensor_rank(dst); - QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type; - - if (enable_npu_rpc) { - //TODO: NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); - if (nullptr != qnn_buffer_0) { - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - } - - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); - if (nullptr != qnn_buffer_1) { - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - } - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - - if (enable_npu_rpc) { - //TODO:NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - if (nullptr != qnn_buffer_2) { - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - } - } - - // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; - -#if GGMLQNN_PRINT_OP_ADD_LOG - op_perf.info(); -#endif + return false; } -/* - * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs - * using the QNN backend. 
this function performs matrix multiplication of the input tensor - * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, - * and stores the result in the destination tensor `dst`. - * - * @param backend the context which got through (ggml_backend_qnn_context *)backend->context for the - * QNN backend operations. - * @param op the destination tensor where the result of the matrix multiplication will be stored. - * - * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated - * than ggml_qnn_general_node. so it's a standalone function. accordingly, this is another - * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute - * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds - * of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend - * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) - * and src1 is F32, src0 -> f32 in src0', then src0' * src1 -*/ -static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); - qnn_instance * instance = nullptr; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Tensor_t * p_param_tensor = nullptr; - Qnn_Tensor_t * p_tensor2_transpose = nullptr; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); - - const enum ggml_type type = src0->type; - const uint32_t src0_rank = ggml_get_tensor_rank(src0); - const uint32_t src1_rank = ggml_get_tensor_rank(src1); - - GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - GGML_ASSERT(nb00 == ggml_type_size(type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - GGML_ASSERT(src0_rank == src1_rank); - GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy - - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - const int64_t ne_plane = ne01 * ne00; - const size_t desired_size = ((GGML_TYPE_F32 == type) ? 
0 : ne03 * ne02 * ne_plane * sizeof(float)); - if (ctx->work_size < desired_size) { - ctx->work_data.reset(new char[desired_size]); - ctx->work_size = desired_size; - } - void * wdata = ctx->work_data.get(); - // convert src0 to float - if (type != GGML_TYPE_F32) { - const auto * type_traits = ggml_get_type_traits(type); - ggml_to_float_t const to_float = type_traits->to_float; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; - float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; - - const int min_cols_per_thread = 4096; - const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); - const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); - for (int i = 1; i < n_threads; i++) { - const int64_t start = i * ne01 / n_threads; - const int64_t end = (i + 1) * ne01 / n_threads; - if (start < end) { - ctx->tasks.push_back(std::async(std::launch::async, [=]() { - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); - } - })); - } - } - { - // reuse the current thread for the first task - const int64_t start = 0; - const int64_t end = ne01 / n_threads; - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); - } - } - } - } - - // wait for all tasks to finish - for (auto & task : ctx->tasks) { - task.get(); - } - ctx->tasks.clear(); - } - - std::string graph_name; - get_graph_key_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_tensor1 = tensors[1]; - p_tensor2 = tensors[2]; - p_param_tensor = tensors[3]; - p_tensor2_transpose = tensors[4]; - } else { - p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - } - print_tensors_info(__func__, ctx, src0, src1, dst); - - //ensure QNN tensor has correct tensor type - QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - - //save the original dimensions of qnn tensors - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; - - if (!graph_initialized) { - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - /* - there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn - 1. transpose - a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: - struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); - which like this: - +---+---+ - | 0 | 1 | - +---+---+ - | 2 | 3 | - +---+---+ - | 4 | 5 | - +---+---+ - with - ne[0] = 2 - ne[1] = 3 - there are different dimension order between ggml tensor and qnn tensor - - 2. 
QNN's MatMul can only support input tensors with rank >= 2 - - in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose - operation when offloading mulmat to QNN backend. this concise implementation will handle - transpose in func ggml_qnn_create_general_tensor() - */ - //step-1: create qnn graph - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), nullptr, &graph_handle); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); - return; - } - //step-2: create param tensor for mulmat of 2d/3d/4d matrix - const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { - {0}, - {1, 0}, - {0, 2, 1}, - {0, 1, 3, 2}, - }; - uint32_t param_tensor_dims[1] = {src0_rank}; - p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); - - //step-3: create compute tensor from ggml tensor - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - if (type != GGML_TYPE_F32) { - QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - //step-4: create a transpose tensor - p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); - - //step-5: compose qnn graph: add mat_mul node - Qnn_Param_t out_0_params[] = { - {QNN_PARAMTYPE_SCALAR, - QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, - .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} - } - }; - - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; -#if 0 //leave here for easily understand code, can be removed in the future - Qnn_OpConfig_t out_0 = { - QNN_OPCONFIG_VERSION_1, .v1 = - {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - 1, - out_0_params, - 2, - out_0_inputs, - 1, - out_0_outputs} - }; -#else - Qnn_OpConfig_t out_0 = create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); -#endif - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); - - //step-5: compose qnn graph: add transpose node - Qnn_Param_t out_trans1_0_params[] = { - {(Qnn_ParamType_t) 1, - "perm", .tensorParam = *p_param_tensor - } - }; - Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; - Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; -#if 0 //leave here for easily understand code, can be removed in the future - Qnn_OpConfig_t out_trans1_0 = { - QNN_OPCONFIG_VERSION_1, - .v1 = {"ggmlqnn_mulmat_transpose_opconfig", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, 1, - out_trans1_0_params, - 1, - out_trans1_0_inputs, - 1, - 
out_trans1_0_outputs} - }; -#else - Qnn_OpConfig_t out_trans1_0 = create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, - out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); -#endif - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); - - //step-6: finalize qnn graph and execute qnn graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - input_tensors_0, 2, - output_tensors_0, 1, - nullptr, nullptr)); - - qnn_tensors_t ggml_op_mulmat_tensors; - ggml_op_mulmat_tensors.reserve(5); - ggml_op_mulmat_tensors.push_back(p_tensor0); - ggml_op_mulmat_tensors.push_back(p_tensor1); - ggml_op_mulmat_tensors.push_back(p_tensor2); - ggml_op_mulmat_tensors.push_back(p_param_tensor); - ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); - auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; - } else { - if (type != GGML_TYPE_F32) { - QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - // this is the second technical approach or another pipeline of "how to utilize the Hexagon - // NPU maximally" through QNN SDK, details could be found at - // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - } - - // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; - op_perf.info(); -} - -static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor) { +static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { ggmlqnn_op_func_t func = nullptr; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - switch (tensor->op) { + switch (dst->op) { + case GGML_OP_REPEAT: + ggml_qnn_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggml_qnn_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggml_qnn_dup(ctx, dst); + break; case GGML_OP_ADD: func = ggml_qnn_general_node; break; - - case GGML_OP_MUL_MAT: - func = ggml_qnn_mul_mat; + case GGML_OP_ACC: + ggml_qnn_acc(ctx, dst); break; - case GGML_OP_MUL: func = ggml_qnn_general_node; break; - + case GGML_OP_DIV: + ggml_qnn_div(ctx, dst); + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + break; + case GGML_UNARY_OP_SILU: + break; + case GGML_UNARY_OP_GELU_QUICK: + break; + case GGML_UNARY_OP_TANH: + break; + case GGML_UNARY_OP_RELU: + break; + case GGML_UNARY_OP_HARDSIGMOID: + break; + case GGML_UNARY_OP_HARDSWISH: + break; + default: + return false; + } + break; + case 
GGML_OP_NORM: + ggml_qnn_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggml_qnn_group_norm(ctx, dst); + break; + case GGML_OP_CONCAT: + ggml_qnn_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggml_qnn_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggml_qnn_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggml_qnn_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggml_qnn_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggml_qnn_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggml_qnn_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggml_qnn_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggml_qnn_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggml_qnn_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggml_qnn_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggml_qnn_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggml_qnn_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + case GGML_OP_DIAG_MASK_INF: + ggml_qnn_diag_mask(ctx, dst, -INFINITY); + break; + case GGML_OP_SOFT_MAX: + ggml_qnn_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggml_qnn_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggml_qnn_im2col(ctx, dst); + break; + case GGML_OP_POOL_2D: + ggml_qnn_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggml_qnn_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggml_qnn_argsort(ctx, dst); + break; default: return false; } if (nullptr != func) - func(backend, tensor); + func(ctx, dst); return true; } @@ -3695,10 +2728,9 @@ static ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_host_ptr(ggml_b GGML_UNUSED(max_tensor_size); } - static bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; - return (ggml_qnn_can_handle_op(op)); + return (ggml_qnn_can_handle_op(ctx,op)); } static bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh deleted file mode 100755 index 3d239510b8d63..0000000000000 --- a/scripts/build-run-android.sh +++ /dev/null @@ -1,282 +0,0 @@ -#!/bin/bash - -set -e - -PWD=`pwd` -ANDROID_PLATFORM=android-34 -ANDROID_NDK=${PWD}/android-ndk-r26c -REMOTE_PATH=/data/local/tmp/ -GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf -GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf - -#QNN SDK could be found at: -#https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk -#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk -QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.31.0.250130/ - -#default is QNN NPU -qnnbackend=2 - -function dump_vars() -{ - echo -e "ANDROID_NDK: ${ANDROID_NDK}" - echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" -} - - -function show_pwd() -{ - echo -e "current working path:$(pwd)\n" -} - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from ${QNN_SDK_URL}...\n" - exit 1 - fi -} - - -function check_and_download_ndk() -{ - is_android_ndk_exist=1 - - if [ ! -d ${ANDROID_NDK} ]; then - is_android_ndk_exist=0 - fi - - if [ ! 
-f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then - is_android_ndk_exist=0 - fi - - if [ ${is_android_ndk_exist} -eq 0 ]; then - - if [ ! -f android-ndk-r26c-linux.zip ]; then - wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip - fi - - unzip android-ndk-r26c-linux.zip - - if [ $? -ne 0 ]; then - printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" - exit 1 - fi - - printf "android ndk saved to ${ANDROID_NDK} \n\n" - else - printf "android ndk already exist:${ANDROID_NDK} \n\n" - fi -} - - -function build_arm64 -{ - cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} - cd out/android - make -j16 - show_pwd - - cd - -} - - -function remove_temp_dir() -{ - if [ -d out ]; then - echo "remove out directory in `pwd`" - rm -rf out - fi -} - - -function check_qnn_libs() -{ - #reuse the cached qnn libs on Android phone - adb shell ls ${REMOTE_PATH}/libQnnCpu.so - if [ $? -eq 0 ]; then - printf "QNN libs already exist on Android phone\n" - else - update_qnn_libs - fi -} - - -function update_qnn_libs() -{ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ -} - - -function build_ggml_qnn() -{ - show_pwd - check_and_download_ndk - check_qnn_sdk - dump_vars - remove_temp_dir - build_arm64 -} - - -function prepare_run_on_phone() -{ - if [ $# != 1 ]; then - print "invalid param" - return - fi - program=$1 - - check_qnn_libs - - if [ -f ./out/android/bin/libggml-qnn.so ]; then - adb push ./out/android/bin/*.so ${REMOTE_PATH}/ - fi - adb push ./out/android/bin/${program} ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/${program} -} - -function run_llamacli() -{ - prepare_run_on_phone llama-cli - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" - -} - - -function run_llamabench() -{ - prepare_run_on_phone llama-bench - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}" - -} - - -function run_test-backend-ops() -{ - prepare_run_on_phone test-backend-ops - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/test-backend-ops test" - -} - -function run_ut_add() -{ - prepare_run_on_phone ggml-qnn-ut - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_ADD -b $qnnbackend" - -} - -function run_ut_mulmat() -{ - prepare_run_on_phone ggml-qnn-ut - - adb shell "cd ${REMOTE_PATH} \ - && export 
LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_MUL_MAT -b $qnnbackend" - -} - -function run_ut_mul() -{ - prepare_run_on_phone ggml-qnn-ut - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_MUL -b $qnnbackend" - -} - - -function show_usage() -{ - echo "Usage:" - echo " $0 build" - echo " $0 updateqnnlib" - echo " $0 run_testop" - echo " $0 run_ut_add 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_ut_mulmat 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_ut_mul 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo -e "\n\n\n" -} - - -show_pwd - -check_qnn_sdk - -if [ $# == 0 ]; then - show_usage - exit 1 -elif [ $# == 1 ]; then - if [ "$1" == "-h" ]; then - show_usage - exit 1 - elif [ "$1" == "help" ]; then - show_usage - exit 1 - elif [ "$1" == "build" ]; then - build_ggml_qnn - exit 0 - - elif [ "$1" == "run_testop" ]; then - run_test-backend-ops - exit 0 - - elif [ "$1" == "updateqnnlib" ]; then - update_qnn_libs - exit 0 - else - show_usage - exit 1 - fi -elif [ $# == 2 ]; then - qnnbackend=$2 - if [ ${qnnbackend} -gt 3 ]; then - show_usage - exit 1 - fi - - if [ "$1" == "run_llamacli" ]; then - run_llamacli - exit 0 - elif [ "$1" == "run_llamabench" ]; then - run_llamabench - exit 0 - elif [ "$1" == "run_ut_add" ]; then - run_ut_add - exit 0 - elif [ "$1" == "run_ut_mulmat" ]; then - run_ut_mulmat - exit 0 - elif [ "$1" == "run_ut_mul" ]; then - run_ut_mul - exit 0 - fi -else - show_usage - exit 1 -fi From 85b8570a02c0a585df13d872188c1398588c225c Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 27 Feb 2025 16:51:25 +0800 Subject: [PATCH 089/200] ggml-qnn: enable release build with necessary logs to make reviewers happy --- ggml/src/ggml-qnn/ggml-qnn-impl.h | 4 + ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 4 + ggml/src/ggml-qnn/ggml-qnn.cpp | 9 +- scripts/build-run-android.sh | 240 +++++++++++++++++++++++++++++ 4 files changed, 253 insertions(+), 4 deletions(-) create mode 100755 scripts/build-run-android.sh diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h index 974755955f9d2..a4e00e0b7bbd7 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -94,11 +94,15 @@ class qnn_instance; struct ggml_backend_qnn_context; void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); +<<<<<<< HEAD <<<<<<< HEAD #if 0//def NDEBUG ======= #ifdef NDEBUG >>>>>>> ggml-qnn: refine source code structure to make code more clearly +======= +#if 0//def NDEBUG +>>>>>>> ggml-qnn: enable release build with necessary logs to make reviewers happy #define GGMLQNN_DEBUG 0 #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info #define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 6614a1b90f6fd..b6a8f020bbeb7 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -111,6 +111,7 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { p_tensor1 = ggmlqnn_create_compute_tensor(src1); p_tensor2 = ggmlqnn_create_compute_tensor(dst); } +<<<<<<< HEAD <<<<<<< HEAD //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, 
dst); ======= @@ -118,6 +119,9 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { print_tensors_info(__func__, ctx, src0, src1, dst); #endif >>>>>>> ggml-qnn: refine source code structure to make code more clearly +======= + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); +>>>>>>> ggml-qnn: enable release build with necessary logs to make reviewers happy //ensure QNN tensor has correct tensor type QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index ff1a8a0f39506..5276001a8523b 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -2248,7 +2248,7 @@ static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const s } if (tensor->op == GGML_OP_MUL_MAT) { - dump_op_info(tensor); + //dump_op_info(tensor); if (src0_rank != src1_rank) // make QNN SDK happy return false; if (src0_rank < 2) // QNN's limitation, make QNN SDK happy @@ -2260,15 +2260,16 @@ static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const s if (ctx->device == QNN_BACKEND_NPU) if (2 == src0_rank) - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K ) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); else return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); else - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) - && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q4_0 + || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K) + && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); } if (tensor->op == GGML_OP_MUL) { diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh new file mode 100755 index 0000000000000..1a5f362fe2083 --- /dev/null +++ b/scripts/build-run-android.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +set -e + +PWD=`pwd` +ANDROID_PLATFORM=android-34 +ANDROID_NDK=${PWD}/android-ndk-r26c +REMOTE_PATH=/data/local/tmp/ +GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf +GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf + +#QNN SDK could be found at: +#https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.31.0.250130/ + +#default is QNN NPU +qnnbackend=2 + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from ${QNN_SDK_URL}...\n" + exit 1 + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! 
-f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + unzip android-ndk-r26c-linux.zip + + if [ $? -ne 0 ]; then + printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" + exit 1 + fi + + printf "android ndk saved to ${ANDROID_NDK} \n\n" + else + printf "android ndk already exist:${ANDROID_NDK} \n\n" + fi +} + + +function build_arm64 +{ + cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + cd out/android + make -j16 + show_pwd + + cd - +} + + +function remove_temp_dir() +{ + if [ -d out ]; then + echo "remove out directory in `pwd`" + rm -rf out + fi +} + + +function check_qnn_libs() +{ + #reuse the cached qnn libs on Android phone + adb shell ls ${REMOTE_PATH}/libQnnCpu.so + if [ $? -eq 0 ]; then + printf "QNN libs already exist on Android phone\n" + else + update_qnn_libs + fi +} + + +function update_qnn_libs() +{ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ +} + + +function build_ggml_qnn() +{ + show_pwd + check_and_download_ndk + check_qnn_sdk + dump_vars + remove_temp_dir + build_arm64 +} + + +function run_llamacli() +{ + check_qnn_libs + + if [ -f ./out/android/bin/libggml-qnn.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/llama-cli ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/llama-cli + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + +} + + +function run_llamabench() +{ + check_qnn_libs + + if [ -f ./out/android/bin/libggml-qnn.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/llama-bench ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/llama-bench + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}" + +} + + +function run_test-backend-ops() +{ + check_qnn_libs + + if [ -f ./out/android/bin/libggml-qnn.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/test-backend-ops ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/test-backend-ops + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test" + +} + + +function show_usage() +{ + echo "Usage:" + echo " $0 build" + echo " $0 updateqnnlib" + echo " $0 run_testop" + echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) 
/ 2 (QNN_NPU) / 3 (ggml)" + echo -e "\n\n\n" +} + + +show_pwd + +check_qnn_sdk + +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + show_usage + exit 1 + elif [ "$1" == "help" ]; then + show_usage + exit 1 + elif [ "$1" == "build" ]; then + build_ggml_qnn + exit 0 + + elif [ "$1" == "run_testop" ]; then + run_test-backend-ops + exit 0 + elif [ "$1" == "updateqnnlib" ]; then + update_qnn_libs + exit 0 + else + show_usage + exit 1 + fi +elif [ $# == 2 ]; then + qnnbackend=$2 + if [ ${qnnbackend} -gt 3 ]; then + show_usage + exit 1 + fi + + if [ "$1" == "run_llamacli" ]; then + run_llamacli + exit 0 + elif [ "$1" == "run_llamabench" ]; then + run_llamabench + exit 0 + fi +else + show_usage + exit 1 +fi From d1ba7c89d088374bf933c6b4242b8fa35d192734 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 27 Feb 2025 17:14:47 +0800 Subject: [PATCH 090/200] ggml-qnn: enable all quantize type with 2d mulmat --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 986 ----------------------------- ggml/src/ggml-qnn/ggml-qnn.cpp | 3 +- 2 files changed, 1 insertion(+), 988 deletions(-) delete mode 100644 ggml/src/ggml-qnn/ggml-qnn-ops.cpp diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp deleted file mode 100644 index b6a8f020bbeb7..0000000000000 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ /dev/null @@ -1,986 +0,0 @@ -/* - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ -#include "ggml-impl.h" -#include "ggml-common.h" -#include "ggml-qnn-ops.h" - -<<<<<<< HEAD -static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = ggml_get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} - -static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - qnn_instance * instance = ctx->instance; - if (nullptr == instance) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - return true; -} - -======= ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - -/* - * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input - * tensor and 1 output tensor -*/ -void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - enum ggml_status result = GGML_STATUS_SUCCESS; - bool graph_initialized = false; - qnn_instance * instance = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Param_t qnn_params[] = {}; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - size_t qnn_op_index = ggmlqnn_get_op_index(op); - GGML_ASSERT(qnn_op_index < ggmlqnn_get_opcaps_size()); -<<<<<<< HEAD - const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; -======= - const char * qnn_op_name = k_op_caps[qnn_op_index].qnn_op_name; ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); - const char * ggml_op_name = ggml_op_name_string.c_str(); - - qnn_perf op_perf = qnn_perf(ggml_op_name); - op_perf.start(); - - std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensor = std::get<1>(graph_item); - p_tensor0 = tensor[0]; - p_tensor1 = tensor[1]; - p_tensor2 = tensor[2]; - } else { - p_tensor0 = ggmlqnn_create_compute_tensor(src0); - p_tensor1 = ggmlqnn_create_compute_tensor(src1); - p_tensor2 = ggmlqnn_create_compute_tensor(dst); - } -<<<<<<< HEAD -<<<<<<< HEAD - //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); -======= -#if GGMLQNN_PRINT_OP_ADD_LOG - print_tensors_info(__func__, ctx, src0, src1, dst); -#endif ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -======= - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); ->>>>>>> ggml-qnn: enable release build with necessary logs to make reviewers happy - - //ensure QNN tensor has correct tensor 
type - QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - - //save the original dimensions of qnn tensors - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; - - bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; - - if (!graph_initialized) { - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); - return; - } - graph_handle = instance->get_qnn_graph_handle(); - - if (enable_npu_rpc) { - QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor0)->clientBuf = {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*p_tensor1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor1)->clientBuf = {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*p_tensor2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor2)->clientBuf = {.data=nullptr, .dataSize=0}; - } - - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - - if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer_0 = ggmlqnn_create_rpc_buffer(instance, src0, p_tensor0, true); - uint8_t * qnn_rpcbuffer_1 = ggmlqnn_create_rpc_buffer(instance, src1, p_tensor1, true); - uint8_t * qnn_rpcbuffer_2 = ggmlqnn_create_rpc_buffer(instance, dst, p_tensor2, false); - if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { - GGMLQNN_LOG_INFO("create rpc buffer failure\n"); - //TODO: potential memory leak although it shouldn't happen - return; - } - } else { -<<<<<<< HEAD - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; -======= - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - } - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - Qnn_OpConfig_t op_config = { - QNN_OPCONFIG_VERSION_1, .v1 = { - ggml_op_name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, - 0, - qnn_params, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - - if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - 
GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); - if (nullptr != qnn_rpcbuffer) { - memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); - } - } - - qnn_tensors_t ggml_op_add_tensors; - ggml_op_add_tensors.reserve(3); - ggml_op_add_tensors.push_back(p_tensor0); - ggml_op_add_tensors.push_back(p_tensor1); - ggml_op_add_tensors.push_back(p_tensor2); - - auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; - } else { - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - src0_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = ggmlqnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*p_tensor0)->rank = ggml_n_dims(src0); - QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; - - QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*p_tensor1)->rank = ggml_n_dims(src1); - QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; - - QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; - QNN_VER_PTR(*p_tensor2)->rank = ggml_n_dims(dst); - QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type; - - if (enable_npu_rpc) { - //TODO: NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); - if (nullptr != qnn_buffer_0) { - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - } - - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); - if (nullptr != qnn_buffer_1) { - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - } else { -<<<<<<< HEAD - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; -======= - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - } - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - - if (enable_npu_rpc) { - //TODO:NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - if (nullptr != qnn_buffer_2) { - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } 
- } - } - - // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; - -#if GGMLQNN_PRINT_OP_ADD_LOG - op_perf.info(); -#endif -} - -/* -<<<<<<< HEAD - * this function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend - * UT in ggml-qnn-ut.cpp passed: - * ./scripts/build-run-android.sh run_ut_mulmat 0 - * ./scripts/build-run-android.sh run_ut_mulmat 1 - * ./scripts/build-run-android.sh run_ut_mulmat 2 - * - * the logic of ggml_qnn_mul_mat_4d is similar to ggml_qnn_mul_mat but much more complicated - * than ggml_qnn_mul_mat, so it's a standalone function. - * it will be combined with ggml_qnn_mul_mat in the future - */ -static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); - qnn_instance *instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - const ggml_tensor *src0 = op->src[0]; - const ggml_tensor *src1 = op->src[1]; - ggml_tensor *dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); - op_perf.start(); - - std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); - GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); - - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t *p_tensor0 = nullptr; - Qnn_Tensor_t *p_reshape0_out = nullptr; - Qnn_Tensor_t *p_tile0_out = nullptr; - Qnn_Tensor_t *p_tensor1 = nullptr; - Qnn_Tensor_t *p_permute1_out = nullptr; - Qnn_Tensor_t *p_reshape1_out = nullptr; - Qnn_Tensor_t *p_matmul_out = nullptr; - Qnn_Tensor_t *p_reshape2_out = nullptr; - - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t &graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t &tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_reshape0_out = tensors[1]; - p_tile0_out = tensors[2]; - p_tensor1 = tensors[3]; - p_permute1_out = tensors[4]; - p_reshape1_out = tensors[5]; - p_matmul_out = tensors[6]; - p_reshape2_out = tensors[7]; - } else { - CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), NULL, &graph_handle)); - - // Define dimensions - uint32_t K = src0->ne[0]; // Inner dimension - uint32_t M = src0->ne[1]; // Rows of src0 - uint32_t N = src1->ne[1]; // Columns of src1 - uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch - uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) - - // Validate K only - GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match - - // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] - uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; - p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src0_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - - // Reshape src0 to [B0, M, K] - uint32_t reshape0_out_dims[] = {B0, M, K}; - p_reshape0_out = GQCGT(nullptr, "reshape0_out", 
QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - reshape0_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape0_out)); - Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; - Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; - Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape0_inputs, 1, reshape0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); - - // Tile src0 to match B1: [B0, M, K] -> [B1, M, K] - uint32_t tile0_out_dims[] = {B1, M, K}; - p_tile0_out = GQCGT(nullptr, "tile0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - tile0_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile0_out)); - uint32_t tile_multiples[] = {B1 / B0, 1, 1}; - uint32_t tile_dims[] = {3}; - Qnn_Tensor_t *p_tile_multiples = GQCGT(nullptr, "tile_multiples", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - tile_dims, tile_multiples, sizeof(tile_multiples)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile_multiples)); - Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; - Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; - Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; - Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TILE, tile_params, 1, - tile0_inputs, 1, tile0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); - - // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] - uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; - p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src1_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - - // Permute src1 to [B1, H1, K, N] - uint32_t perm_data[] = {0, 1, 3, 2}; - uint32_t perm_dims[] = {4}; - Qnn_Tensor_t *p_perm = GQCGT(nullptr, "perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - perm_dims, perm_data, sizeof(perm_data)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_perm)); - uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; - p_permute1_out = GQCGT(nullptr, "permute1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, - permute1_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_permute1_out)); - Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; - Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; - Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; - Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, permute1_params, 1, - permute1_inputs, 1, permute1_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); - - // Reshape src1 to [B1, K, N] - uint32_t reshape1_out_dims[] = {B1, K, N}; - p_reshape1_out = GQCGT(nullptr, "reshape1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - reshape1_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape1_out)); - Qnn_Tensor_t reshape1_inputs[] = 
{*p_permute1_out}; - Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; - Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape1_inputs, 1, reshape1_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); - - // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] - uint32_t matmul_out_dims[] = {B1, M, N}; - p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - matmul_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); - Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; - Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; - Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, nullptr, 0, - matmul_inputs, 2, matmul_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - - // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] - uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - p_reshape2_out = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, - reshape2_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape2_out)); - Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; - Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; - Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape2_inputs, 1, reshape2_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); - - // Finalize - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - - // Cache - qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out}; - instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - } - - // Execute - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; - QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; - - Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, - output_tensors, 1, NULL, NULL)); - -#if 0 - // Log dst for debugging - float *dst_data = (float *)dst->data; - GGMLQNN_LOG_DEBUG("dst shape: [%d, %d, %d, %d]\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); - for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) { - GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]); - } -#endif - - op_perf.info(); -} - -/* -======= ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs - * using the QNN backend. this function performs matrix multiplication of the input tensor - * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, - * and stores the result in the destination tensor `dst`. 
- * - * @param backend the context which got through (ggml_backend_qnn_context *)backend->context for the - * QNN backend operations. - * @param op the destination tensor where the result of the matrix multiplication will be stored. - * - * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated - * than ggml_qnn_general_node. so it's a standalone function. accordingly, this is another - * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute - * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds - * of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend - * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) - * and src1 is F32, src0 -> f32 in src0', then src0' * src1 -*/ -void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); - qnn_instance * instance = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Tensor_t * p_param_tensor = nullptr; - Qnn_Tensor_t * p_tensor2_transpose = nullptr; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); - - const enum ggml_type src0_type = src0->type; - const uint32_t src0_rank = ggml_n_dims(src0); - const uint32_t src1_rank = ggml_n_dims(src1); - GGML_ASSERT(src0_rank == src1_rank); - GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy -<<<<<<< HEAD - if (4 == src0_rank) { - return ggml_qnn_mul_mat_4d(ctx, op); - } -======= - GGML_ASSERT(src0_rank != 4); //TODO: 4D matrix mulmat ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - void * wdata = ggmlqnn_type_trait(ctx, op); - const size_t desired_size = ctx->desired_size; - - std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_tensor1 = tensors[1]; - p_tensor2 = tensors[2]; - p_param_tensor = tensors[3]; - p_tensor2_transpose = tensors[4]; - } else { - p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - } - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - - //ensure QNN tensor has correct tensor type - QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - - //save the original dimensions of qnn tensors - uint32_t * tensor_0_dimensions = 
QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; - - if (!graph_initialized) { - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - /* - there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn - 1. transpose - a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: - struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); - which like this: - +---+---+ - | 0 | 1 | - +---+---+ - | 2 | 3 | - +---+---+ - | 4 | 5 | - +---+---+ - with - ne[0] = 2 - ne[1] = 3 - there are different dimension order between ggml tensor and qnn tensor - - 2. QNN's MatMul can only support input tensors with rank >= 2 - - in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose - operation when offloading mulmat to QNN backend. this concise implementation will handle - transpose in func ggml_qnn_create_general_tensor() - */ - //step-1: create qnn graph - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), nullptr, &graph_handle); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); - return; - } - //step-2: create param tensor for mulmat of 2d/3d/4d matrix - const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { - {0}, - {1, 0}, - {0, 2, 1}, - {0, 1, 3, 2}, - }; - uint32_t param_tensor_dims[1] = {src0_rank}; - p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); - - //step-3: create compute tensor from ggml tensor - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - if (src0_type != GGML_TYPE_F32) { - QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; - } else { -<<<<<<< HEAD - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; -======= - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - - //step-4: create a transpose tensor - p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); - - //step-5: compose qnn graph: add mat_mul node - Qnn_Param_t out_0_params[] = { - {QNN_PARAMTYPE_SCALAR, - QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, - .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} - } - }; - - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t out_0_outputs[] = 
{*p_tensor2_transpose}; -#if 0 //leave here for easily understand code, can be removed in the future - Qnn_OpConfig_t out_0 = { - QNN_OPCONFIG_VERSION_1, .v1 = - {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - 1, - out_0_params, - 2, - out_0_inputs, - 1, - out_0_outputs} - }; -#else - Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, -<<<<<<< HEAD - out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); -======= - out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -#endif - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); - - //step-5: compose qnn graph: add transpose node - Qnn_Param_t out_trans1_0_params[] = { -<<<<<<< HEAD - {QNN_PARAMTYPE_TENSOR, -======= - {(Qnn_ParamType_t) 1, ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - "perm", .tensorParam = *p_param_tensor - } - }; - Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; - Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; -#if 0 //leave here for easily understand code, can be removed in the future - Qnn_OpConfig_t out_trans1_0 = { - QNN_OPCONFIG_VERSION_1, - .v1 = {"ggmlqnn_mulmat_transpose_opconfig", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, 1, - out_trans1_0_params, - 1, - out_trans1_0_inputs, - 1, - out_trans1_0_outputs} - }; -#else - Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, -<<<<<<< HEAD - out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); -======= - out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -#endif - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); - - //step-6: finalize qnn graph and execute qnn graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - input_tensors_0, 2, - output_tensors_0, 1, - nullptr, nullptr)); - - qnn_tensors_t ggml_op_mulmat_tensors; - ggml_op_mulmat_tensors.reserve(5); - ggml_op_mulmat_tensors.push_back(p_tensor0); - ggml_op_mulmat_tensors.push_back(p_tensor1); - ggml_op_mulmat_tensors.push_back(p_tensor2); - ggml_op_mulmat_tensors.push_back(p_param_tensor); - ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); - auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; - } else { - if (src0_type != GGML_TYPE_F32) { - QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; - } else { -<<<<<<< HEAD - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; -======= - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - - 
Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - // this is the second technical approach or another pipeline of "how to utilize the Hexagon - // NPU maximally" through QNN SDK, details could be found at - // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - } - - // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; - op_perf.info(); -} -<<<<<<< HEAD - -void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); - GGML_UNUSED(value); -} - -void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -======= -void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_add(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void 
ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -static void ggml_qnn_avg_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -static void ggml_qnn_max_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { -} - -void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -} - -void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - ggml_qnn_dup(ctx, dst); -} - -void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -<<<<<<< HEAD - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -======= -} - -void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { -} - -void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -} diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 5276001a8523b..7704a4ad038f7 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -2267,8 +2267,7 @@ static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const s else return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); else - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q4_0 - || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K) + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); } From e3f266abf8e68aefcd345b8343b7c6173f5b4334 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 28 Feb 2025 12:23:31 +0800 Subject: [PATCH 091/200] ggml-qnn: enable log output of GGMLQNN_LOG_INFO in command line mode for 
benchmark more conveniently --- ggml/src/ggml-qnn/ggml-qnn-impl.h | 753 ------------------------------ ggml/src/ggml-qnn/ggml-qnn.cpp | 21 +- 2 files changed, 12 insertions(+), 762 deletions(-) delete mode 100644 ggml/src/ggml-qnn/ggml-qnn-impl.h diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h deleted file mode 100644 index a4e00e0b7bbd7..0000000000000 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ /dev/null @@ -1,753 +0,0 @@ -/* -* Copyright (c) 2023-2024 The ggml authors -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to -* deal in the Software without restriction, including without limitation the -* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -* sell copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -*/ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(__ANDROID__) || defined(__linux__) -#include -#include -#include -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if (defined __ANDROID__) || (defined ANDROID) -#include "android/log.h" -#endif - -<<<<<<< HEAD -#if defined(_WIN32) -#include -======= -#if defined(_WIN32) || defined(_MSC_VER) ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -#include -#endif - -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnContext.h" -#include "QnnBackend.h" -#include "QnnGraph.h" -#include "QnnProperty.h" -#include "QnnTensor.h" -#include "QnnInterface.h" -#include "Saver/QnnSaver.h" -#include "System/QnnSystemInterface.h" -#include "HTP/QnnHtpDevice.h" -#include "HTP/QnnHtpGraph.h" - -#include "ggml-qnn.h" -#include "ggml-impl.h" -#include "ggml-backend-impl.h" - -class qnn_instance; -struct ggml_backend_qnn_context; -void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); - -<<<<<<< HEAD -<<<<<<< HEAD -#if 0//def NDEBUG -======= -#ifdef NDEBUG ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -======= -#if 0//def NDEBUG ->>>>>>> ggml-qnn: enable release build with necessary logs to make reviewers happy -#define GGMLQNN_DEBUG 0 -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 0 -#else -#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend -#define 
ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 -#endif -#define GGML_QNN_LOGBUF_LEN 4096 - -<<<<<<< HEAD -#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -======= -#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - -#if GGMLQNN_DEBUG -#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define GGMLQNN_LOG_DEBUG(...) -#endif - -#define CHECK_QNN_API(error, result) \ - do { \ - error = (result); \ - if (QNN_SUCCESS != error) { \ - if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ - GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ - } else { \ - GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_error_string(error)); \ - } \ - } \ - } while (0) - -#define QNN_VER_PTR(x) (&((x).v1)) -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - -#define DISABLE_COPY(class_name) \ - class_name(const class_name &) = delete; \ - void operator=(const class_name &) = delete - -#define DISABLE_MOVE(class_name) \ - class_name(class_name &&) = delete; \ - void operator=(class_name &&) = delete - -#define GQCGT ggmlqnn_create_general_tensor - -<<<<<<< HEAD -#if defined(_WIN32) -#define RTLD_GLOBAL 0x100 -#define RTLD_LOCAL 0x000 -#define RTLD_LAZY 0x000 -#define RTLD_NOW 0x001 -void * dlopen(const char * filename, int flag); -int dlclose(void * handle); -void * dlsym(void* handle, const char* name); -const char * dlerror(void); -#endif - -======= ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - -using qnn_res_t = std::tuple>; -using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; - -enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; - -enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - V79 = 79, -}; - -enum qcom_chipset_soc_model { - UNKNOWN_SM = 0, - SM7450 = 41, // v69, 7 Gen1 - SM8350 = 30, // v68, 888 - SM8450 = 36, // v69, SD 8 Gen 1 - SM8475 = 42, // v69, SD 8+ Gen 1 - SM8550 = 43, // v73, SD 8 Gen 2 - SM8650 = 57, // v75, SD 8 Gen 3 - SM8750 = 69, // v79, SD 8 Gen 4 -#if defined(_MSC_VER) - SC7280X = 44, - 
SC8280X = 37, - SC8380XP = 60, -#endif -}; - -struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; - char soc_desc[GGML_MAX_NAME]; -}; - -struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char desc[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - struct qcom_socinfo socinfo; - - std::unique_ptr work_data; - std::vector> tasks; - size_t work_size = 0; - size_t desired_size = 0; - int n_threads = GGML_DEFAULT_N_THREADS; -}; - -struct qnn_op_caps_t { - const char * qnn_op_name = nullptr; - const size_t input_param_count = 0; - const char * qnn_param_name = nullptr; -}; -<<<<<<< HEAD -extern const qnn_op_caps_t ggmlqnn_k_op_caps[]; -======= -extern const qnn_op_caps_t k_op_caps[]; ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() { - _begin_time = ggml_time_us(); - } - - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } - -private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; -}; -#else -class qnn_perf { -public: -<<<<<<< HEAD - qnn_perf(const std::string & perf_name) { - GGML_UNUSED(perf_name); - } -======= - qnn_perf(const std::string & perf_name) {} ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() {} - void info() {} -}; -#endif - -class qnn_interface { -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... 
args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - friend class qnn_instance; - -public: - qnn_interface() = default; - - // QnnBackend -<<<<<<< HEAD - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree) - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage) - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig) - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion) - - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree) - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure) - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo) - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo) - - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize) - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary) - - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary) - - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree) - - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode) - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize) - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve) - - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree) - - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel) - - // QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents) - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents) - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData) - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree) - - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister) - - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister) - - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability) - - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor) - - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor) - - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate) - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo) - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree) -======= - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); - - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); - - 
DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); - - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); - - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); - - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); - - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); - - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); - - // QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); - - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); - - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); - - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); - - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); - - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); - - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - - void set_qnn_interface(const QnnInterface_t * qnn_interface) { - _qnn_interface = qnn_interface; - } - - void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } - - uint32_t get_backend_id() const { - return _qnn_interface->backendId; - } - - bool is_loaded() const { - return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); - } - -private: -<<<<<<< HEAD - const QnnInterface_t * _qnn_interface = nullptr; - - const QnnSystemInterface_t * _qnn_sys_interface = nullptr; -======= - const QnnInterface_t *_qnn_interface = nullptr; - - const QnnSystemInterface_t *_qnn_sys_interface = nullptr; ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -}; - -class qnn_instance { -public: - using BackendIdType = decltype(QnnInterface_t{}.backendId); - - explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, - const std::string & model_name) : - _lib_path(std::move(lib_path)), - _backend_name(std::move(backend_name)), -<<<<<<< HEAD - _model_name(std::move(model_name)) {} -======= - _model_name(std::move(model_name)) {}; ->>>>>>> ggml-qnn: refine source code structure 
to make code more clearly - - ~qnn_instance() { - } - - int qnn_init(const QnnSaver_Config_t ** saver_config); - - int qnn_finalize(); - - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_interface; - } - - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } - - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_system_interface; - } - -<<<<<<< HEAD - Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - - Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - - Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - - Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - - Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - - QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - - Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } -======= - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - - const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - - const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - - const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - - const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - - const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } ->>>>>>> ggml-qnn: refine source code structure to make code more clearly - - int init_qnn_graph(const char * graph_name, - bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr - ); - int init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); - - int finalize_qnn_graph(); - - bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } - - int init_htp_perfinfra(); - - int set_rpc_polling(); - - int set_high_performance_mode(); - - std::string & get_qnn_graph_name() { return _graph_name; } - - bool is_rpcmem_initialized() { - return _rpcmem_initialized; - } - - void set_rpcmem_initialized(bool initialized) { - _rpcmem_initialized = initialized; - } - - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - size_t get_rpcmem_usage() { return _rpcmem_usage; } - - int32_t rpcmem_to_fd(void * buf); - - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); - Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); - - void unregister_rpcmem(); - void unregister_rpcmem(Qnn_MemHandle_t mem_handle); - - void * alloc_rpcmem(size_t bytes, size_t alignment); - void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); - - void free_rpcmem(void * buf); - void free_rpcmem(); - - bool is_rpcmem_allocated(void * buf); - - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { - return _qnn_mem_set.count(handle) != 0U; - } - - bool enable_qnn_rpc() { - return _enable_qnn_rpc; - } - -public: - std::map>> _qnn_graph_map; - -private: - int 
load_system(); - - int unload_system(); - - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); - - int unload_backend(); - - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; - } - - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - - void * alloc_rpcmem_internal(size_t bytes, size_t alignment); - - void probe_device_meminfo(); - -private: - static constexpr const int _required_num_providers = 1; - -private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // name of prebuilt QNN model, might be used in the future - BackendIdType _backend_id; - - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; - - void * _system_lib_handle = nullptr; - - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing - - qnn_interface _qnn_interface; - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - - std::unordered_map _qnn_mem_set; - std::unordered_map _qnn_rpc_buffer_to_handles; - - static std::mutex _init_mutex; - static std::unordered_map _loaded_lib_handle; - static std::unordered_map _lib_path_to_backend_id; - static std::unordered_map _loaded_backend; - - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - std::unordered_map _rpcmem_usage_map; - size_t _rpcmem_usage = 0; // mempool usage in Mbytes - size_t _rpcmem_capacity = 512; // mempool size in Mbytes - - std::string _graph_name; - QNNBackend _device_id; - void * _rpc_lib_handle = nullptr; - bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature - - DISABLE_COPY(qnn_instance); - DISABLE_MOVE(qnn_instance); -}; - -size_t ggmlqnn_get_opcaps_size(void); -size_t ggmlqnn_get_op_index(const ggml_tensor * tensor); -Qnn_Tensor_t * ggmlqnn_create_compute_tensor(const ggml_tensor * tensor); -const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code); -Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype); -void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); -void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output); -<<<<<<< HEAD -======= -bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); ->>>>>>> ggml-qnn: refine source code structure to make code more clearly -uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const 
ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata); -void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, - Qnn_Param_t * params, uint32_t num_params, - Qnn_Tensor_t * inputs, uint32_t num_inputs, - Qnn_Tensor_t * outputs, uint32_t num_outputs); -Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, - Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, - uint32_t rank, uint32_t * dims, - void * data, uint32_t data_size, - bool b_transpose = false); diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 7704a4ad038f7..2818a5e3e10d2 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -62,6 +62,9 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * #if (defined __ANDROID__) || (defined ANDROID) //for Android application(standard APP or command line tool) __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); + if (GGML_LOG_LEVEL_INFO == level) { + printf("%s\n", s_ggmlqnn_log_internal_buf); + } #else //for Snapdragon based WoA(Windows on ARM) device or Linux printf("%s\n", s_ggmlqnn_log_internal_buf); @@ -1038,25 +1041,25 @@ void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op) { void * wdata = ctx->work_data.get(); // convert src0 to float if (src0_type != GGML_TYPE_F32) { - const auto *type_traits = ggml_get_type_traits(src0_type); - ggml_to_float_t const to_float = type_traits->to_float; + const auto * type_traits = ggml_get_type_traits(src0_type); + ggml_to_float_t const to_float = type_traits->to_float; for (int64_t i03 = 0; i03 < ne03; i03++) { for (int64_t i02 = 0; i02 < ne02; i02++) { - const void *x = (char *) src0->data + i02 * nb02 + i03 * nb03; - float *const wplane = (float *) wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; const int min_cols_per_thread = 4096; - const int min_rows_per_thread = std::max((int) (min_cols_per_thread / ne00), 1); + const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); const int n_threads = std::max( - std::min(ctx->n_threads, (int) (ne01 / min_rows_per_thread)), 1); + std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); for (int i = 1; i < n_threads; i++) { const int64_t start = i * ne01 / n_threads; - const int64_t end = (i + 1) * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; if (start < end) { ctx->tasks.push_back(std::async(std::launch::async, [=]() { for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); } })); } @@ -1996,7 +1999,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi return error; } - GGMLQNN_LOG_INFO("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); + GGMLQNN_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); _qnn_graph_handle = graph_handle; return QNN_SUCCESS; } From 4291439d3a478bd07182684f1f35e79e238fa3bc Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 28 
Feb 2025 22:34:46 +0800 Subject: [PATCH 092/200] ggml-qnn: Windows port --- step2 --- ggml/src/ggml-qnn/ggml-qnn-impl.h | 609 ++++++++++++++++++++++++++++++ ggml/src/ggml-qnn/ggml-qnn.cpp | 99 +++-- 2 files changed, 670 insertions(+), 38 deletions(-) create mode 100644 ggml/src/ggml-qnn/ggml-qnn-impl.h diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h new file mode 100644 index 0000000000000..0f0daba6e1a93 --- /dev/null +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -0,0 +1,609 @@ +/* +* Copyright (c) 2023-2024 The ggml authors +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to +* deal in the Software without restriction, including without limitation the +* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +* sell copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +*/ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__ANDROID__) || defined(__linux__) +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if (defined __ANDROID__) || (defined ANDROID) +#include "android/log.h" +#endif + +#if defined(_WIN32) +#include +#include +#endif + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" +#include "HTP/QnnHtpGraph.h" + +#include "ggml-qnn.h" +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + +class qnn_instance; +struct ggml_backend_qnn_context; +void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); + +#if 0//def NDEBUG +#define GGMLQNN_DEBUG 0 +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 0 +#else +#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 +#endif +#define GGML_QNN_LOGBUF_LEN 4096 + +#define 
GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGMLQNN_DEBUG +#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLQNN_LOG_DEBUG(...) +#endif + +#define CHECK_QNN_API(error, result) \ + do { \ + error = (result); \ + if (QNN_SUCCESS != error) { \ + if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ + GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + } else { \ + GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_error_string(error)); \ + } \ + } \ + } while (0) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete + +#define GQCGT ggmlqnn_create_general_tensor + +#if defined(_WIN32) +#define RTLD_GLOBAL 0x100 +#define RTLD_LOCAL 0x000 +#define RTLD_LAZY 0x000 +#define RTLD_NOW 0x001 +void * dlopen(const char * filename, int flag); +int dlclose(void * handle); +void * dlsym(void* handle, const char* name); +const char * dlerror(void); +#endif + +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + +using qnn_res_t = std::tuple>; +using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, +}; + +enum qcom_chipset_soc_model { + UNKNOWN_SM = 0, + SM7450 = 41, // v69, 7 Gen1 + SM8350 = 30, // v68, 888 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Gen 4 +#if defined(_MSC_VER) + SC7280X = 44, + SC8280X = 37, + SC8380XP = 60, +#endif +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + char soc_desc[GGML_MAX_NAME]; +}; + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char desc[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; + + std::unique_ptr work_data; + std::vector> tasks; + size_t work_size = 0; + size_t desired_size = 0; + int n_threads = GGML_DEFAULT_N_THREADS; +}; + +struct qnn_op_caps_t { + const char * qnn_op_name = nullptr; + const size_t input_param_count = 0; + const char * qnn_param_name = nullptr; +}; +extern const qnn_op_caps_t k_op_caps[]; + +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { 
+public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() { + _begin_time = ggml_time_us(); + } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + } + +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; +#else +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) {} + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() {} + void info() {} +}; +#endif + +class qnn_interface { +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } + + friend class qnn_instance; + +public: + qnn_interface() = default; + + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + + 
DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t * _qnn_interface = nullptr; + + const QnnSystemInterface_t * _qnn_sys_interface = nullptr; +}; + +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {}; + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + const Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + int init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); + + int finalize_qnn_graph(); + + bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } + + int init_htp_perfinfra(); + + int set_rpc_polling(); + + int set_high_performance_mode(); + + std::string & get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + 
size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + size_t get_rpcmem_usage() { return _rpcmem_usage; } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); + + void unregister_rpcmem(); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle); + + void * alloc_rpcmem(size_t bytes, size_t alignment); + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); + + void free_rpcmem(void * buf); + void free_rpcmem(); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + + bool enable_qnn_rpc() { + return _enable_qnn_rpc; + } + +public: + std::map>> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + + void * alloc_rpcmem_internal(size_t bytes, size_t alignment); + + void probe_device_meminfo(); + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // name of prebuilt QNN model, might be used in the future + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + void * _system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + qnn_interface _qnn_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_rpc_buffer_to_handles; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + std::unordered_map _rpcmem_usage_map; + size_t _rpcmem_usage = 0; // mempool usage in Mbytes + size_t _rpcmem_capacity = 512; // mempool size in Mbytes + + std::string _graph_name; + QNNBackend _device_id; + void * _rpc_lib_handle = nullptr; + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC 
feature + + DISABLE_COPY(qnn_instance); + DISABLE_MOVE(qnn_instance); +}; + +size_t ggmlqnn_get_opcaps_size(void); +size_t ggmlqnn_get_op_index(const ggml_tensor * tensor); +Qnn_Tensor_t * ggmlqnn_create_compute_tensor(const ggml_tensor * tensor); +const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code); +Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype); +void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); +void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output); +uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata); +void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t num_outputs); +Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false); diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 2818a5e3e10d2..51678f7b51ca3 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -77,6 +77,48 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * // ================================================================================================= // section-3: general helper macro / data structure / function // ================================================================================================= +#if defined(_WIN32) +static const char * last_func = nullptr; +static long last_err; +void * dlopen(const char * dll, int flags) { + HINSTANCE h = LoadLibraryA(dll); + if (h == NULL) { + last_err = GetLastError(); + last_func = "dlopen"; + } + return h; +} + +int dlclose(void * h) { + if (!FreeLibrary((HINSTANCE)h)) { + last_err = GetLastError(); + last_func = "dlclose"; + return -1; + } + return 0; +} + +void * dlsym(void * h, const char * name) { + FARPROC p = GetProcAddress((HINSTANCE)h, name); + if (!p) { + last_err = GetLastError(); + last_func = "dlsym"; + } + return (void*)(intptr_t)p; +} + +const char * dlerror(void) { + static char str[512]; + if (!last_err) return nullptr; + + snprintf(str, 512, "%s error #%ld", last_func, last_err); + last_err = 0; + last_func = NULL; + + return str; +} +#endif + static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? 
offset : offset + @@ -94,7 +136,7 @@ static size_t get_system_total_memory_in_bytes() { auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return pages * page_size; -#elif defined(_WIN32) || defined(_MSC_VER) +#elif defined(_WIN32) //TODO: Snapdragon based WoA(Windows on ARM) return 0; #else @@ -112,7 +154,7 @@ static size_t get_system_free_memory_in_bytes() { auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); return avail_pages * page_size; -#elif defined(_WIN32) || defined(_MSC_VER) +#elif defined(_WIN32) //TODO: Snapdragon based WoA(Windows on ARM) return 0; #else @@ -143,7 +185,7 @@ static void * ggmlqnn_host_malloc(size_t n) { GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); return nullptr; } -#elif defined(_WIN32) || defined(_MSC_VER) +#elif defined(_WIN32) //TODO: Snapdragon based WoA(Windows on ARM) return nullptr; #else @@ -569,7 +611,7 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, -#if defined(_MSC_VER) +#if defined(_WIN32) /* Qualcomm SnapDragon 7c Gen 2 */ [SC7280X] = { .soc_model = SC7280X, @@ -619,7 +661,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-cpu", .desc = "Qualcomm Kryo CPU", -#if defined(_MSC_VER) +#if defined(_WIN32) .lib = "QnnCpu.dll", #else .lib = "libQnnCpu.so", @@ -634,7 +676,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-gpu", .desc = "Qualcomm Adreno GPU", -#if defined(_MSC_VER) +#if defined(_WIN32) .lib = "QnnGpu.dll", #else .lib = "libQnnGpu.so", @@ -649,7 +691,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-npu", .desc = "Qualcomm NPU(Hexagon Tensor Processor)", -#if defined(_MSC_VER) +#if defined(_WIN32) .lib = "QnnHtp.dll", #else .lib = "libQnnHtp.so", @@ -1160,14 +1202,7 @@ bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * template Fn load_qnn_functionpointers(void * handle, const char * function_name) { -#if defined(__ANDROID__) || defined(__linux__) return reinterpret_cast(dlsym(handle, function_name)); -#elif defined(_WIN32) || defined(_MSC_VER) - //TODO: Snapdragon based WoA(Windows on ARM) - return nullptr; -#else -#error "ggml-qnn only support WoA, Android, Linux" -#endif } std::mutex qnn_instance::_init_mutex; @@ -1419,14 +1454,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); -#if defined(__ANDROID__) || defined(__linux__) void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); -#elif defined(_WIN32) || defined(_MSC_VER) - //TODO: Snapdragon based WoA(Windows on ARM) - void * lib_handle = nullptr; -#else -#error "ggml-qnn only support WoA, Android, Linux" -#endif if (nullptr == lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); return 1; @@ -1529,30 +1557,24 @@ int qnn_instance::unload_backend() { int qnn_instance::load_system() { Qnn_ErrorHandle_t error = QNN_SUCCESS; +#ifdef _WIN32 + std::string system_lib_path = _lib_path + "QnnSystem.dll"; +#else std::string system_lib_path = _lib_path + "libQnnSystem.so"; +#endif GGMLQNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); -#if defined(__ANDROID__) || defined(__linux__) _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); -#elif defined(_WIN32) || defined(_MSC_VER) - 
//TODO: Snapdragon based WoA(Windows on ARM) - _system_lib_handle = nullptr; -#else -#error "ggml-qnn only support WoA, Android, Linux" -#endif if (nullptr == _system_lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); //re-try with default path of QNN binary runtime lib _lib_path = "/data/local/tmp/"; - system_lib_path = _lib_path + "libQnnSystem.so"; -#if defined(__ANDROID__) || defined(__linux__) - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); -#elif defined(_WIN32) || defined(_MSC_VER) - //TODO: Snapdragon based WoA(Windows on ARM) - _system_lib_handle = nullptr; +#ifdef _WIN32 + system_lib_path = _lib_path + "QnnSystem.dll"; #else -#error "ggml-qnn only support WoA, Android, Linux" + system_lib_path = _lib_path + "libQnnSystem.so"; #endif + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _system_lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); return 1; @@ -1786,9 +1808,8 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { #if defined(__ANDROID__) || defined(__linux__) _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); -#elif defined(_WIN32) || defined(_MSC_VER) - //TODO: Snapdragon based WoA(Windows on ARM) - _rpc_lib_handle = nullptr; +#elif defined(_WIN32) + _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); #else #error "ggml-qnn only support WoA, Android, Linux" #endif @@ -2901,6 +2922,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return g_qnn_mgr[device].backend; } +#if defined(__ANDROID__) std::string path = qnn_lib_path; if (QNN_BACKEND_NPU == device) { if (0 == setenv("LD_LIBRARY_PATH", @@ -2929,6 +2951,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { GGMLQNN_LOG_ERROR("%s backend setenv failure\n", ggml_backend_qnn_get_devname(device)); } } +#endif qnn_instance * instance = nullptr; instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); From 12a4ad14266b879d026fb7635787c551bcece595 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 08:37:25 +0800 Subject: [PATCH 093/200] ggml-qnn: merge UT code and corresponding script from local dev branch to make workflow easily --- scripts/build-run-android-minimal.sh | 240 ++++++++++++ scripts/build-run-android.sh | 78 +++- tests/ggml-qnn-ut.cpp | 550 +++++++++++++++++++++++++++ 3 files changed, 850 insertions(+), 18 deletions(-) create mode 100755 scripts/build-run-android-minimal.sh create mode 100644 tests/ggml-qnn-ut.cpp diff --git a/scripts/build-run-android-minimal.sh b/scripts/build-run-android-minimal.sh new file mode 100755 index 0000000000000..1a5f362fe2083 --- /dev/null +++ b/scripts/build-run-android-minimal.sh @@ -0,0 +1,240 @@ +#!/bin/bash + +set -e + +PWD=`pwd` +ANDROID_PLATFORM=android-34 +ANDROID_NDK=${PWD}/android-ndk-r26c +REMOTE_PATH=/data/local/tmp/ +GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf +GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf + +#QNN SDK could be found at: +#https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.31.0.250130/ + +#default is QNN NPU +qnnbackend=2 + +function dump_vars() +{ + echo -e "ANDROID_NDK: ${ANDROID_NDK}" 
+ echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from ${QNN_SDK_URL}...\n" + exit 1 + fi +} + + +function check_and_download_ndk() +{ + is_android_ndk_exist=1 + + if [ ! -d ${ANDROID_NDK} ]; then + is_android_ndk_exist=0 + fi + + if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then + is_android_ndk_exist=0 + fi + + if [ ${is_android_ndk_exist} -eq 0 ]; then + + if [ ! -f android-ndk-r26c-linux.zip ]; then + wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + fi + + unzip android-ndk-r26c-linux.zip + + if [ $? -ne 0 ]; then + printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" + exit 1 + fi + + printf "android ndk saved to ${ANDROID_NDK} \n\n" + else + printf "android ndk already exist:${ANDROID_NDK} \n\n" + fi +} + + +function build_arm64 +{ + cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + cd out/android + make -j16 + show_pwd + + cd - +} + + +function remove_temp_dir() +{ + if [ -d out ]; then + echo "remove out directory in `pwd`" + rm -rf out + fi +} + + +function check_qnn_libs() +{ + #reuse the cached qnn libs on Android phone + adb shell ls ${REMOTE_PATH}/libQnnCpu.so + if [ $? -eq 0 ]; then + printf "QNN libs already exist on Android phone\n" + else + update_qnn_libs + fi +} + + +function update_qnn_libs() +{ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ +} + + +function build_ggml_qnn() +{ + show_pwd + check_and_download_ndk + check_qnn_sdk + dump_vars + remove_temp_dir + build_arm64 +} + + +function run_llamacli() +{ + check_qnn_libs + + if [ -f ./out/android/bin/libggml-qnn.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/llama-cli ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/llama-cli + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + +} + + +function run_llamabench() +{ + check_qnn_libs + + if [ -f ./out/android/bin/libggml-qnn.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/llama-bench ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/llama-bench + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}" + +} + + +function run_test-backend-ops() +{ + 
check_qnn_libs + + if [ -f ./out/android/bin/libggml-qnn.so ]; then + adb push ./out/android/bin/*.so ${REMOTE_PATH}/ + fi + adb push ./out/android/bin/test-backend-ops ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/test-backend-ops + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test" + +} + + +function show_usage() +{ + echo "Usage:" + echo " $0 build" + echo " $0 updateqnnlib" + echo " $0 run_testop" + echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo -e "\n\n\n" +} + + +show_pwd + +check_qnn_sdk + +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + show_usage + exit 1 + elif [ "$1" == "help" ]; then + show_usage + exit 1 + elif [ "$1" == "build" ]; then + build_ggml_qnn + exit 0 + + elif [ "$1" == "run_testop" ]; then + run_test-backend-ops + exit 0 + elif [ "$1" == "updateqnnlib" ]; then + update_qnn_libs + exit 0 + else + show_usage + exit 1 + fi +elif [ $# == 2 ]; then + qnnbackend=$2 + if [ ${qnnbackend} -gt 3 ]; then + show_usage + exit 1 + fi + + if [ "$1" == "run_llamacli" ]; then + run_llamacli + exit 0 + elif [ "$1" == "run_llamabench" ]; then + run_llamabench + exit 0 + fi +else + show_usage + exit 1 +fi diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 1a5f362fe2083..49079c9132769 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -129,32 +129,37 @@ function build_ggml_qnn() } -function run_llamacli() +function prepare_run_on_phone() { + if [ $# != 1 ]; then + print "invalid param" + return + fi + program=$1 + check_qnn_libs if [ -f ./out/android/bin/libggml-qnn.so ]; then adb push ./out/android/bin/*.so ${REMOTE_PATH}/ fi - adb push ./out/android/bin/llama-cli ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/llama-cli + adb push ./out/android/bin/${program} ${REMOTE_PATH}/ + adb shell chmod +x ${REMOTE_PATH}/${program} +} + +function run_llamacli() +{ + prepare_run_on_phone llama-cli adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-ncv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" } function run_llamabench() { - check_qnn_libs - - if [ -f ./out/android/bin/libggml-qnn.so ]; then - adb push ./out/android/bin/*.so ${REMOTE_PATH}/ - fi - adb push ./out/android/bin/llama-bench ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/llama-bench + prepare_run_on_phone llama-bench adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ @@ -165,13 +170,7 @@ function run_llamabench() function run_test-backend-ops() { - check_qnn_libs - - if [ -f ./out/android/bin/libggml-qnn.so ]; then - adb push ./out/android/bin/*.so ${REMOTE_PATH}/ - fi - adb push ./out/android/bin/test-backend-ops ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/test-backend-ops + prepare_run_on_phone test-backend-ops adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ @@ -179,6 +178,36 @@ function run_test-backend-ops() } +function run_ut_add() +{ + prepare_run_on_phone ggml-qnn-ut + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_ADD -b $qnnbackend" + +} 
+ +function run_ut_mulmat() +{ + prepare_run_on_phone ggml-qnn-ut + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_MUL_MAT -b $qnnbackend" + +} + +function run_ut_mul() +{ + prepare_run_on_phone ggml-qnn-ut + + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_MUL -b $qnnbackend" + +} + function show_usage() { @@ -186,6 +215,9 @@ function show_usage() echo " $0 build" echo " $0 updateqnnlib" echo " $0 run_testop" + echo " $0 run_ut_add 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_ut_mulmat 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_ut_mul 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" echo -e "\n\n\n" @@ -213,6 +245,7 @@ elif [ $# == 1 ]; then elif [ "$1" == "run_testop" ]; then run_test-backend-ops exit 0 + elif [ "$1" == "updateqnnlib" ]; then update_qnn_libs exit 0 @@ -233,6 +266,15 @@ elif [ $# == 2 ]; then elif [ "$1" == "run_llamabench" ]; then run_llamabench exit 0 + elif [ "$1" == "run_ut_add" ]; then + run_ut_add + exit 0 + elif [ "$1" == "run_ut_mulmat" ]; then + run_ut_mulmat + exit 0 + elif [ "$1" == "run_ut_mul" ]; then + run_ut_mul + exit 0 fi else show_usage diff --git a/tests/ggml-qnn-ut.cpp b/tests/ggml-qnn-ut.cpp new file mode 100644 index 0000000000000..ff0e96f2b00cb --- /dev/null +++ b/tests/ggml-qnn-ut.cpp @@ -0,0 +1,550 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * implementation of self-made Android command line tool for verify ggml-qnn backend + * this file will help you to understand fundamental principle of ggml and ggml-qnn backend + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ggml.h" +#include "ggml-cpu.h" +#include "ggml-alloc.h" +#include "ggml-backend.h" +#include "ggml-qnn.h" + +#define GGML_QNN_DEBUG 1 +#define GGML_QNN_LOGBUF_LEN 4096 + +#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGML_QNN_DEBUG +#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define QNN_LOG_DEBUG(...) +#endif + +static void tensor_dump(const ggml_tensor * tensor, const char * name); + +#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) + +static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { + static std::mutex ggml_qnn_log_internal_mutex; + static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + + { + std::lock_guard lock(ggml_qnn_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { + printf("%s", s_ggml_qnn_log_internal_buf); + } + va_end(args); + } +} + + +static bool ggml_graph_compute_helper( + struct ggml_backend * backend, + struct ggml_cgraph * graph, + std::vector & buf, + int n_threads, + ggml_abort_callback abort_callback, + void * abort_callback_data) { + struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, NULL); + + plan.abort_callback = abort_callback; + plan.abort_callback_data = abort_callback_data; + + if (plan.work_size > 0) { + buf.resize(plan.work_size); + plan.work_data = buf.data(); + } + + if (nullptr != backend) + return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; + else + return ggml_graph_compute(graph, &plan); +} + + +static void tensor_dump_elements(const ggml_tensor * tensor) { + float value = 0; + std::ostringstream tmposs; + if (tensor->type == GGML_TYPE_F32) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + QNN_LOG_DEBUG("%s\n", tmposs.str().c_str()); + } + tmposs.clear(); + tmposs.str(""); + } + } + } + } + + QNN_LOG_DEBUG("\n"); +} + + +static void tensor_dump(const ggml_tensor * tensor, const char * name) { + QNN_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = 
(%5zi, %5zi, %5zi, %5zi)\n", + name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); + tensor_dump_elements(tensor); + + QNN_LOG_DEBUG("\n"); +} + + +static uint32_t get_tensor_rank(const ggml_tensor * tensor) { + uint32_t rank = 0; + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { + rank++; + } + } + return rank; +} + + +static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = get_tensor_rank(tensor); + for (size_t i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + QNN_LOG_DEBUG("get_tensor_data_size %d", data_size); + QNN_LOG_DEBUG("ggml_nbytes(tensor) %d", ggml_nbytes(tensor)); + + return ggml_nbytes(tensor); +} + + +//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 +static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { + // static RNG initialization (revisit if n_threads stops being constant) + static const size_t n_threads = std::thread::hardware_concurrency(); + static std::vector generators = []() { + std::random_device rd; + std::vector vec; + vec.reserve(n_threads); + //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed + for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } + return vec; + }(); + + size_t size = ggml_nelements(tensor); + std::vector data(size); + + auto init_thread = [&](size_t ith, size_t start, size_t end) { + std::uniform_real_distribution distribution(min, max); + for (size_t i = start; i < end; i++) { + data[i] = distribution(generators[ith]); + } + }; + + std::vector threads; + threads.reserve(n_threads); + for (size_t i = 0; i < n_threads; i++) { + size_t start = i*size/n_threads; + size_t end = (i+1)*size/n_threads; + threads.emplace_back(init_thread, i, start, end); + } + for (auto & t : threads) { + t.join(); + } + if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { + ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); + } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { + GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); + std::vector dataq(ggml_row_size(tensor->type, size)); + std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix + const float * im = imatrix.data(); + if (!ggml_quantize_requires_imatrix(tensor->type)) { + // when the imatrix is optional, we want to test both quantization with and without imatrix + // use one of the random numbers to decide + if (data[0] > 0.5f*(min + max)) { + im = nullptr; + } + } + ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); + GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); + ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); + } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { + // This is going to create some weird integers though. 
+ ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); + } else { + GGML_ASSERT(false); + } +} + + +//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 +static void initialize_tensors(ggml_context * ctx) { + for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { + init_tensor_uniform(t); + } +} + + +static void show_usage() { + printf(" " \ + "\nUsage: ggml-qnn-ut [options]\n" \ + "\n" \ + "Options:\n" \ + " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ + " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(QNN_GGML)\n" \ + " ?/h print usage information\n\n" + ); +} + + +struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } }; +typedef std::unique_ptr ggml_backend_ptr; + +int main(int argc, char * argv[]) { + int64_t n_begin_time = 0LL; + int64_t n_end_time = 0LL; + int64_t n_duration = 0LL; + size_t ctx_size = 0; + int sizey = 4; + int sizex = 4; + int num_threads = 4; + int n_backend_type = QNN_BACKEND_CPU; + int n_ggml_op_type = GGML_OP_ADD; + + struct ggml_context * ctx = nullptr; + struct ggml_cgraph * gf = nullptr; + struct ggml_tensor * src0 = nullptr; + struct ggml_tensor * src1 = nullptr; + struct ggml_tensor * dst = nullptr; + ggml_backend_t backend = nullptr; + ggml_backend_buffer_t buffer= nullptr; + ggml_type qtype = GGML_TYPE_F32; + //ggml_type qtype = GGML_TYPE_Q4_0; + std::vector work_buffer; + + for (int i = 1; i < argc; i++) { + if (0 == strcmp(argv[i], "-t")) { + if (i + 1 < argc) { + if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { + n_ggml_op_type = GGML_OP_ADD; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { + n_ggml_op_type = GGML_OP_MUL_MAT; + } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { + n_ggml_op_type = GGML_OP_MUL; + } else { + show_usage(); + return 1; + } + i++; + } + } else if (0 == strcmp(argv[i], "-b")) { + if (i + 1 < argc) { + int backend = atoi(argv[i + 1]); + if (backend <= QNN_BACKEND_GGML) + n_backend_type = backend; + else { + show_usage(); + return 1; + } + i++; + } + } else { + show_usage(); + return 1; + } + } + std::vector backends; + std::vector> set_n_threads_fns; + printf("Testing %zu devices\n\n", ggml_backend_dev_count()); + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); + + printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), + ggml_backend_dev_name(dev)); + + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + printf(" Skipping CPU backend\n"); + continue; + } + + backend = ggml_backend_dev_init(dev, reinterpret_cast(i)); + GGML_ASSERT(backend != NULL); + if (backend != nullptr) { + printf("%s: initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + } + backends.emplace_back(backend); + + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address( + reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency()); + } + + printf(" Device description: %s\n", ggml_backend_dev_description(dev)); + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); + printf("\n"); + } + + ggml_backend_t backend_cpu = nullptr; + backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, 
nullptr); + if (nullptr == backend_cpu) { + QNN_LOG_DEBUG("failed to initialize cpu backend\n"); + exit(1); + } else { + QNN_LOG_DEBUG("succeed to initialize cpu backend\n"); + } + backends.emplace_back(backend_cpu); + + size_t n_ok = 0; + + QNN_LOG_DEBUG("enter qnn_ggml_op\n"); + QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + + n_begin_time = ggml_time_us(); + srand(time(NULL)); + + ctx_size += 1024 * 1024 * 32; + QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, + (ctx_size / 1024 / 1024)); + + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /* no_alloc =*/ 0 + }; + + int idx = 0; + for (auto & backend_it : backends) { + if (idx == n_backend_type) { + backend = backend_it.get(); + } + idx++; + ggml_backend_dev_t dev = ggml_backend_get_device(backend_it.get()); + ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; + if (reg) { + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + set_n_threads_fns.emplace_back(backend_it.get(), ggml_backend_set_n_threads_fn); + } + } + const char * name = ggml_backend_dev_description(dev); + QNN_LOG_DEBUG("dev name %s\n", name); + + } + + if (n_backend_type != QNN_BACKEND_GGML) { + params.no_alloc = true; + } + + ctx = ggml_init(params); + if (!ctx) { + QNN_LOG_ERROR("%s: ggml_init() failed\n"); + return 2; + } + + QNN_LOG_DEBUG("creating new tensors\n"); + QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype)); + QNN_LOG_DEBUG("ggml_type_size(%s) %d\n", ggml_type_name(qtype), ggml_type_size(qtype)); + if (qtype != GGML_TYPE_F32) { + sizex = ggml_blck_size(qtype); + } + + if (n_ggml_op_type == GGML_OP_ADD) { + src0 = ggml_new_tensor_2d(ctx, qtype, sizey, sizex); + src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizey, sizex); + } else { + //verify 2D matrix + //src0 = ggml_new_tensor_2d(ctx, qtype, 128, 64); + //src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 2); + //verify 3D matrix + //src0 = ggml_new_tensor_3d(ctx, qtype, 128, 64, 8); + //src1 = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 2, 8); + //verify 4D matrix + src0 = ggml_new_tensor_4d(ctx, qtype, 256, 16, 3, 2); + src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 1, 6, 4); + //src0 = ggml_new_tensor_4d(ctx, qtype, 256, 16, 3, 2); + //src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); + } + + ggml_set_input(src0); + ggml_set_input(src1); + switch (n_ggml_op_type) { + case GGML_OP_ADD: + dst = ggml_add(ctx, src0, src1); + break; + case GGML_OP_MUL: + dst = ggml_mul(ctx, src0, src1); + break; + case GGML_OP_MUL_MAT: + dst = ggml_mul_mat(ctx, src0, src1); + break; + default: + QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, + ggml_op_name((enum ggml_op) n_ggml_op_type)); + ggml_free(ctx); + ggml_backend_free(backend); + return 3; + } + + ggml_set_output(dst); + +#ifdef GGML_USE_QNN + if (n_backend_type != QNN_BACKEND_GGML) { + QNN_LOG_DEBUG("init QNN backend %d\n", n_backend_type); + //re-init again + backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); + if (nullptr == backend) { + QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, ggml_backend_qnn_get_devname(n_backend_type)); + return 1; + } else { + QNN_LOG_INFO("create qnn backend %d(%s) succeed\n", n_backend_type, ggml_backend_qnn_get_devname(n_backend_type)); + } + + //buffer = 
ggml_backend_alloc_ctx_tensors(ctx, backend); + ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); + buffer = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buffer) { + QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); + ggml_free(ctx); + ggml_backend_free(backend); + return 4; + } + } else { + QNN_LOG_DEBUG("init default cpu backend\n"); + backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); + } +#endif + + QNN_LOG_DEBUG("creating compute graph\n"); + gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, dst); + + if (qtype == GGML_TYPE_F32) { + if (n_backend_type != QNN_BACKEND_GGML) { + initialize_tensors(ctx); + } else { + ggml_set_f32(src0, (rand() % 100 + 1)); + ggml_set_f32(src1, (rand() % 100 + 1)); + ggml_set_f32(dst, 0.0f); + } + //for compare compute result between cpu backend and QNN backend + ggml_set_f32(src0, 1.0f); + ggml_set_f32(src1, 2.0f); + ggml_set_f32(dst, 0.0f); + } else { + initialize_tensors(ctx); + } + + ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); + if (get_tensor_data_size(dst) < (100 * 100)) { + QNN_LOG_DEBUG("dump result tensors:\n"); + TENSOR_DUMP(src0); + TENSOR_DUMP(src1); + TENSOR_DUMP(dst); + } else { + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->nb[0], src0->nb[1], src0->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->nb[0], src1->nb[1], src1->nb[2]); + QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], + dst->nb[1], dst->nb[2]); + } + //TENSOR_DUMP(dst); + + ggml_free(ctx); + ggml_backend_buffer_free(buffer); + ggml_backend_free(backend); + + n_end_time = ggml_time_us(); + n_duration = (n_end_time - n_begin_time) / 1000; +#ifdef GGML_USE_QNN + QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), ggml_backend_qnn_get_devname(n_backend_type), n_duration); +#endif + + return 0; +} From b4ee01d4a90e7beb821da1f9a35841eba274bf7a Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 09:49:51 +0800 Subject: [PATCH 094/200] ggml-qnn: merge ggml_qnn_mul_mat_4d from local dev branch to make workflow easily --- ggml/src/ggml-qnn/ggml-qnn-impl.h | 4 +- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 812 +++++++++++++++++++++++++++++ ggml/src/ggml-qnn/ggml-qnn.cpp | 10 +- 3 files changed, 819 insertions(+), 7 deletions(-) create mode 100644 ggml/src/ggml-qnn/ggml-qnn-ops.cpp diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h index 0f0daba6e1a93..394c35fe6b043 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -99,7 +99,7 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char #else #define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 1 // enable/disable QNN's internal log #define 
GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU #define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 #endif @@ -226,7 +226,7 @@ struct qnn_op_caps_t { const size_t input_param_count = 0; const char * qnn_param_name = nullptr; }; -extern const qnn_op_caps_t k_op_caps[]; +extern const qnn_op_caps_t ggmlqnn_k_op_caps[]; #if ENABLE_QNNBACKEND_PERF class qnn_perf { diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp new file mode 100644 index 0000000000000..02b7ab7820a95 --- /dev/null +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -0,0 +1,812 @@ +/* + * Copyright (c) 2023-2024 The ggml authors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#include "ggml-impl.h" +#include "ggml-common.h" +#include "ggml-qnn-ops.h" + +static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + +static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + return true; +} + +#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +/* + * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input + * tensor and 1 output tensor +*/ +void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + enum ggml_status result = GGML_STATUS_SUCCESS; + bool graph_initialized = false; + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Param_t qnn_params[] = {}; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + 
QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + size_t qnn_op_index = ggmlqnn_get_op_index(op); + GGML_ASSERT(qnn_op_index < ggmlqnn_get_opcaps_size()); + const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; + std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); + const char * ggml_op_name = ggml_op_name_string.c_str(); + + qnn_perf op_perf = qnn_perf(ggml_op_name); + op_perf.start(); + + std::string graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensor = std::get<1>(graph_item); + p_tensor0 = tensor[0]; + p_tensor1 = tensor[1]; + p_tensor2 = tensor[2]; + } else { + p_tensor0 = ggmlqnn_create_compute_tensor(src0); + p_tensor1 = ggmlqnn_create_compute_tensor(src1); + p_tensor2 = ggmlqnn_create_compute_tensor(dst); + } + //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + + //ensure QNN tensor has correct tensor type + QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; + + //save the original dimensions of qnn tensors + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; + + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; + + if (!graph_initialized) { + GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + graph_handle = instance->get_qnn_graph_handle(); + + if (enable_npu_rpc) { + QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor0)->clientBuf = {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*p_tensor1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor1)->clientBuf = {.data=nullptr, .dataSize=0}; + + QNN_VER_PTR(*p_tensor2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_tensor2)->clientBuf = {.data=nullptr, .dataSize=0}; + } + + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + + if (enable_npu_rpc) { + uint8_t * qnn_rpcbuffer_0 = ggmlqnn_create_rpc_buffer(instance, src0, p_tensor0, true); + uint8_t * qnn_rpcbuffer_1 = ggmlqnn_create_rpc_buffer(instance, src1, p_tensor1, true); + uint8_t * qnn_rpcbuffer_2 = ggmlqnn_create_rpc_buffer(instance, dst, p_tensor2, false); + if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { + GGMLQNN_LOG_INFO("create rpc buffer failure\n"); + //TODO: potential memory leak although it shouldn't happen + return; + } + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, 
ggmlqnn_get_tensor_data_size(dst)}; + } + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + Qnn_OpConfig_t op_config = { + QNN_OPCONFIG_VERSION_1, .v1 = { + ggml_op_name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, + 0, + qnn_params, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + + if (enable_npu_rpc) { + uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); + if (nullptr != qnn_rpcbuffer) { + memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); + } + } + + qnn_tensors_t ggml_op_add_tensors; + ggml_op_add_tensors.reserve(3); + ggml_op_add_tensors.push_back(p_tensor0); + ggml_op_add_tensors.push_back(p_tensor1); + ggml_op_add_tensors.push_back(p_tensor2); + + auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + } else { + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + src0_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = ggmlqnn_datatype_from_ggml_datatype(dst->type); + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*p_tensor0)->rank = ggml_n_dims(src0); + QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; + + QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*p_tensor1)->rank = ggml_n_dims(src1); + QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; + + QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; + QNN_VER_PTR(*p_tensor2)->rank = ggml_n_dims(dst); + QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type; + + if (enable_npu_rpc) { + //TODO: NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } + + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + } + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t 
tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + + if (enable_npu_rpc) { + //TODO:NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + } + } + + // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; + +#if GGMLQNN_PRINT_OP_ADD_LOG + op_perf.info(); +#endif +} + +//FIXME:there is issue in this function +/* + * this function is AI-assisted code from Grok 3. + * the logic of ggml_qnn_mul_mat_4d is similar to ggml_qnn_mul_mat but much more complicated + * than ggml_qnn_mul_mat, so it's a standalone function. + * it will be combined with ggml_qnn_mul_mat after bugfix + */ +static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Tensor_t * p_param_tensor = nullptr; + Qnn_Tensor_t * p_tensor2_transpose = nullptr; + Qnn_Tensor_t * p_gather0_index = nullptr; + Qnn_Tensor_t * p_gather0_out = nullptr; + Qnn_Tensor_t * p_gather1_index = nullptr; + Qnn_Tensor_t * p_gather1_out = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + op_perf.start(); + + uint32_t src0_rank = ggml_n_dims(src0); + uint32_t src1_rank = ggml_n_dims(src1); + GGML_ASSERT(src0_rank == src1_rank); + GGML_ASSERT(src0_rank == 4); + + std::string graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t &graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_tensor1 = tensors[1]; + p_tensor2 = tensors[2]; + p_param_tensor = tensors[3]; + p_tensor2_transpose = tensors[4]; + p_gather0_index = tensors[5]; + p_gather0_out = tensors[6]; + p_gather0_index = tensors[7]; + p_gather1_out = tensors[8]; + } else { + p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + } + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + + //ensure QNN tensor has correct tensor type + QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; + + //save the original dimensions of qnn tensors + 
uint32_t *tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t *tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t *tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; + + if (!graph_initialized) { + //step-1:create graph + GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), NULL, + &graph_handle)); + //step-2:tensor definitions for offload 4D matrix mulmat to QNN backend + /* + tensor0: "p_tensor0" (permutation tensor for Transpose). + tensor1: "p_tensor0" (input tensor for first Gather). + tensor2: "p_gather0_index" (indices for first Gather). + tensor3: "p_gather0_out" (output of first Gather). + tensor4: "p_gather1_index" (indices for second Gather). + tensor5: "p_gather1_out" (output of second Gather). + tensor6: "p_tensor1" (second input for MatMul). + tensor7: "p_tensor2_transpose" (output of MatMul, input to Transpose). + tensor8: "p_tensor2" (output of Transpose). + */ + uint32_t dims0[1] = {4}; + uint32_t data0[4] = {0, static_cast(src1->ne[1]), static_cast(src0->ne[2]), static_cast(src0->ne[3])}; + p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, dims0, data0, src0_rank * sizeof(uint32_t)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); + + p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + + uint32_t dims2[] = {6}; + uint32_t data2[6] = {0, static_cast(src1->ne[1]), static_cast(src0->ne[2]), static_cast(src0->ne[3]), 0, 0}; + p_gather0_index = GQCGT(nullptr, "gather0_index", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, dims2, data2, 24); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_index)); + + uint32_t dims3[] = {static_cast(src0->ne[3]), static_cast(src1->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; + p_gather0_out = GQCGT(nullptr, "gather0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, dims3, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_out)); + + uint32_t dims4[] = {4}; + uint32_t data4[4] = {static_cast(src1->ne[1]), static_cast(src1->ne[1]), static_cast(src0->ne[3]), static_cast(src0->ne[3])}; + p_gather1_index = GQCGT(nullptr, "gather1_index", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, dims4, data4, 16); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_index)); + + uint32_t dims5[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; + p_gather1_out = GQCGT(nullptr, "gather1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, dims5, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_out)); + + p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + + uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; + p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, 
p_tensor2_transpose)); + + p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + + //step-3:gather operation 0 + Qnn_Param_t gather0_params[] = {{QNN_PARAMTYPE_SCALAR,"axis", .scalarParam = {QNN_DATATYPE_INT_32, .int32Value = 1}}}; + Qnn_Tensor_t gather0_inputs[] = {*p_tensor0, *p_gather0_index}; + Qnn_Tensor_t gather0_outputs[] = {*p_gather0_out}; + Qnn_OpConfig_t gather0_op = ggmlqnn_create_op_config("out_gather0", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER, + gather0_params, 1, gather0_inputs, 2, + gather0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather0_op)); + + //step-4:gather operation 1 + Qnn_Param_t gather1_params[] = {{QNN_PARAMTYPE_SCALAR,"axis", .scalarParam = {QNN_DATATYPE_INT_32, .int32Value = 0}}}; + Qnn_Tensor_t gather1_inputs[] = {*p_gather0_out, *p_gather1_index}; + Qnn_Tensor_t gather1_outputs[] = {*p_gather1_out}; + Qnn_OpConfig_t gather1_op = ggmlqnn_create_op_config("out_gather1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER, + gather1_params, 1, gather1_inputs, 2, + gather1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather1_op)); + + //step-5:matmul operation + Qnn_Param_t matmul_params[] = {{QNN_PARAMTYPE_SCALAR,"transpose_in1", .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; + Qnn_Tensor_t matmul_inputs[] = {*p_gather1_out, *p_tensor1}; + Qnn_Tensor_t matmul_outputs[] = {*p_tensor2_transpose}; + Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + matmul_params, 1, matmul_inputs, 2, + matmul_outputs, + 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); + + //step-6:transpose operation + Qnn_Param_t transpose_params[] = { {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; + Qnn_Tensor_t transpose_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t transpose_outputs[]= {*p_tensor2}; + Qnn_OpConfig_t transpose_op = ggmlqnn_create_op_config("transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, + transpose_params, 1, transpose_inputs, 1, + transpose_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, transpose_op)); + + //step-7:finalize graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + + //step-8:execute graph + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_tensor2}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, + 1, NULL, NULL)); + + qnn_tensors_t ggml_op_mulmat_tensors; + ggml_op_mulmat_tensors.reserve(9); + ggml_op_mulmat_tensors.push_back(p_tensor0); + ggml_op_mulmat_tensors.push_back(p_tensor1); + ggml_op_mulmat_tensors.push_back(p_tensor2); + ggml_op_mulmat_tensors.push_back(p_param_tensor); + ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); + ggml_op_mulmat_tensors.push_back(p_gather0_index); + ggml_op_mulmat_tensors.push_back(p_gather0_out); + ggml_op_mulmat_tensors.push_back(p_gather1_index); + ggml_op_mulmat_tensors.push_back(p_gather1_out); + + auto 
graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through QNN SDK + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + } + + // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; + op_perf.info(); +} + +/* + * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs + * using the QNN backend. this function performs matrix multiplication of the input tensor + * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, + * and stores the result in the destination tensor `dst`. + * + * @param backend the context which got through (ggml_backend_qnn_context *)backend->context for the + * QNN backend operations. + * @param op the destination tensor where the result of the matrix multiplication will be stored. + * + * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated + * than ggml_qnn_general_node. so it's a standalone function. accordingly, this is another + * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute + * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds + * of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend + * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) 
+ * and src1 is F32, src0 -> f32 in src0', then src0' * src1 +*/ +void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Tensor_t * p_param_tensor = nullptr; + Qnn_Tensor_t * p_tensor2_transpose = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + op_perf.start(); + + const enum ggml_type src0_type = src0->type; + const uint32_t src0_rank = ggml_n_dims(src0); + const uint32_t src1_rank = ggml_n_dims(src1); + GGML_ASSERT(src0_rank == src1_rank); + GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy + //GGML_ASSERT(src0_rank != 4); //TODO: 4D matrix mulmat + if (4 == src0_rank) { + return ggml_qnn_mul_mat_4d(ctx, op); + } + void * wdata = ggmlqnn_type_trait(ctx, op); + const size_t desired_size = ctx->desired_size; + + std::string graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_tensor1 = tensors[1]; + p_tensor2 = tensors[2]; + p_param_tensor = tensors[3]; + p_tensor2_transpose = tensors[4]; + } else { + p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + } + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + + //ensure QNN tensor has correct tensor type + QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; + QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; + + //save the original dimensions of qnn tensors + uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; + + if (!graph_initialized) { + GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + /* + there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor + + 2. QNN's MatMul can only support input tensors with rank >= 2 + + in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose + operation when offloading mulmat to QNN backend. 
this concise implementation will handle + transpose in func ggml_qnn_create_general_tensor() + */ + //step-1: create qnn graph + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + //step-2: create param tensor for mulmat of 2d/3d/4d matrix + const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, + }; + uint32_t param_tensor_dims[1] = {src0_rank}; + p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); + + //step-3: create compute tensor from ggml tensor + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + //step-4: create a transpose tensor + p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); + + //step-5: compose qnn graph: add mat_mul node + Qnn_Param_t out_0_params[] = { + {QNN_PARAMTYPE_SCALAR, + QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, + .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} + } + }; + + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; +#if 0 //leave here for easily understand code, can be removed in the future + Qnn_OpConfig_t out_0 = { + QNN_OPCONFIG_VERSION_1, .v1 = + {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + 1, + out_0_params, + 2, + out_0_inputs, + 1, + out_0_outputs} + }; +#else + Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, + out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); +#endif + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); + + //step-5: compose qnn graph: add transpose node + Qnn_Param_t out_trans1_0_params[] = { + {(Qnn_ParamType_t) 1, + "perm", .tensorParam = *p_param_tensor + } + }; + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; +#if 0 //leave here for easily understand code, can be removed in the future + Qnn_OpConfig_t out_trans1_0 = { + QNN_OPCONFIG_VERSION_1, + .v1 = {"ggmlqnn_mulmat_transpose_opconfig", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, 1, + out_trans1_0_params, + 1, + out_trans1_0_inputs, + 1, + out_trans1_0_outputs} + }; +#else + Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, + 
out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); +#endif + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); + + //step-6: finalize qnn graph and execute qnn graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + input_tensors_0, 2, + output_tensors_0, 1, + nullptr, nullptr)); + + qnn_tensors_t ggml_op_mulmat_tensors; + ggml_op_mulmat_tensors.reserve(5); + ggml_op_mulmat_tensors.push_back(p_tensor0); + ggml_op_mulmat_tensors.push_back(p_tensor1); + ggml_op_mulmat_tensors.push_back(p_tensor2); + ggml_op_mulmat_tensors.push_back(p_param_tensor); + ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); + auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + } else { + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + // this is the second technical approach or another pipeline of "how to utilize the Hexagon + // NPU maximally" through QNN SDK, details could be found at + // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + } + + // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; + op_perf.info(); +} + +void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + + +void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_dup(ggml_backend_qnn_context * ctx, 
ggml_tensor * dst) { +} + +void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { +} + +void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + ggml_qnn_dup(ctx, dst); +} + +void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} + +void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +} diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 51678f7b51ca3..3b59956009398 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -703,7 +703,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, }; -const qnn_op_caps_t k_op_caps[] = { +const qnn_op_caps_t ggmlqnn_k_op_caps[] = { {}, // GGML_OP_NONE {}, // GGML_OP_DUP { @@ -1152,7 +1152,7 @@ static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & o } size_t ggmlqnn_get_opcaps_size() { - return std::size(k_op_caps); + return std::size(ggmlqnn_k_op_caps); } size_t ggmlqnn_get_op_index(const ggml_tensor * tensor) { @@ -1165,8 +1165,8 @@ size_t ggmlqnn_get_op_index(const ggml_tensor * tensor) { static size_t ggmlqnn_get_op_input_param_count(const ggml_tensor * op) { auto op_index = ggmlqnn_get_op_index(op); - GGML_ASSERT(op_index < std::size(k_op_caps)); - return k_op_caps[op_index].input_param_count; + GGML_ASSERT(op_index < std::size(ggmlqnn_k_op_caps)); + return ggmlqnn_k_op_caps[op_index].input_param_count; } void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output) { @@ -1701,7 +1701,7 @@ static void ggml_qnn_logcallback(const char * fmt, std::lock_guard lock(log_mutex); memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - GGMLQNN_LOG_INFO("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + GGMLQNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } } #else From 33643a9b20f295a3f5fad23106ddfda3f906a504 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 12:06:59 +0800 Subject: [PATCH 095/200] ggml-qnn: submit AI-assisted ggml_qnn_mul_mat_4d(not worked currently) which generated by Grok 3 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 339 +++++++++++++---------------- tests/ggml-qnn-ut.cpp | 6 +- 2 files changed, 150 insertions(+), 195 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 02b7ab7820a95..2553ff78f9c7d 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -278,216 +278,171 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { #endif } -//FIXME:there is issue in this function +//TODO:there is issue in this function /* - * this function is AI-assisted code from Grok 3. 
+ * this function is AI-assisted code from Grok 3 for purpose of 4d mulmat UT in ggml-qnn-ut.cpp + * ./scripts/build-run-android.sh run_ut_mulmat 0 + * ./scripts/build-run-android.sh run_ut_mulmat 1 + * ./scripts/build-run-android.sh run_ut_mulmat 2 + * * the logic of ggml_qnn_mul_mat_4d is similar to ggml_qnn_mul_mat but much more complicated * than ggml_qnn_mul_mat, so it's a standalone function. * it will be combined with ggml_qnn_mul_mat after bugfix */ -static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); - qnn_instance * instance = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Tensor_t * p_param_tensor = nullptr; - Qnn_Tensor_t * p_tensor2_transpose = nullptr; - Qnn_Tensor_t * p_gather0_index = nullptr; - Qnn_Tensor_t * p_gather0_out = nullptr; - Qnn_Tensor_t * p_gather1_index = nullptr; - Qnn_Tensor_t * p_gather1_out = nullptr; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; +static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); + qnn_instance *instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const ggml_tensor *src0 = op->src[0]; // e.g., [256, 16, 3, 2] or [256,16,3,2] + const ggml_tensor *src1 = op->src[1]; // e.g., [256, 1, 6, 4] or [256,16,3, 2] + ggml_tensor *dst = op; // e.g., [16, 1, 6, 4] or [16,16,3, 2] GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); op_perf.start(); - uint32_t src0_rank = ggml_n_dims(src0); - uint32_t src1_rank = ggml_n_dims(src1); - GGML_ASSERT(src0_rank == src1_rank); - GGML_ASSERT(src0_rank == 4); - std::string graph_name; ggmlqnn_get_graphkey_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t &graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_tensor1 = tensors[1]; - p_tensor2 = tensors[2]; - p_param_tensor = tensors[3]; - p_tensor2_transpose = tensors[4]; - p_gather0_index = tensors[5]; - p_gather0_out = tensors[6]; - p_gather0_index = tensors[7]; - p_gather1_out = tensors[8]; - } else { - p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - } + GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - //ensure QNN tensor has correct tensor type - QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - - //save the original dimensions of qnn tensors - uint32_t *tensor_0_dimensions = 
QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t *tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t *tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; - - if (!graph_initialized) { - //step-1:create graph - GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); - CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), NULL, - &graph_handle)); - //step-2:tensor definitions for offload 4D matrix mulmat to QNN backend - /* - tensor0: "p_tensor0" (permutation tensor for Transpose). - tensor1: "p_tensor0" (input tensor for first Gather). - tensor2: "p_gather0_index" (indices for first Gather). - tensor3: "p_gather0_out" (output of first Gather). - tensor4: "p_gather1_index" (indices for second Gather). - tensor5: "p_gather1_out" (output of second Gather). - tensor6: "p_tensor1" (second input for MatMul). - tensor7: "p_tensor2_transpose" (output of MatMul, input to Transpose). - tensor8: "p_tensor2" (output of Transpose). - */ - uint32_t dims0[1] = {4}; - uint32_t data0[4] = {0, static_cast(src1->ne[1]), static_cast(src0->ne[2]), static_cast(src0->ne[3])}; - p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, dims0, data0, src0_rank * sizeof(uint32_t)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); - - p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - - uint32_t dims2[] = {6}; - uint32_t data2[6] = {0, static_cast(src1->ne[1]), static_cast(src0->ne[2]), static_cast(src0->ne[3]), 0, 0}; - p_gather0_index = GQCGT(nullptr, "gather0_index", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, dims2, data2, 24); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_index)); - - uint32_t dims3[] = {static_cast(src0->ne[3]), static_cast(src1->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; - p_gather0_out = GQCGT(nullptr, "gather0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, dims3, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_out)); - - uint32_t dims4[] = {4}; - uint32_t data4[4] = {static_cast(src1->ne[1]), static_cast(src1->ne[1]), static_cast(src0->ne[3]), static_cast(src0->ne[3])}; - p_gather1_index = GQCGT(nullptr, "gather1_index", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, dims4, data4, 16); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_index)); - - uint32_t dims5[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; - p_gather1_out = GQCGT(nullptr, "gather1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, dims5, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_out)); - - p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - - uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {}; - p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); - - p_tensor2 = 
GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - - //step-3:gather operation 0 - Qnn_Param_t gather0_params[] = {{QNN_PARAMTYPE_SCALAR,"axis", .scalarParam = {QNN_DATATYPE_INT_32, .int32Value = 1}}}; - Qnn_Tensor_t gather0_inputs[] = {*p_tensor0, *p_gather0_index}; - Qnn_Tensor_t gather0_outputs[] = {*p_gather0_out}; - Qnn_OpConfig_t gather0_op = ggmlqnn_create_op_config("out_gather0", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER, - gather0_params, 1, gather0_inputs, 2, - gather0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather0_op)); - - //step-4:gather operation 1 - Qnn_Param_t gather1_params[] = {{QNN_PARAMTYPE_SCALAR,"axis", .scalarParam = {QNN_DATATYPE_INT_32, .int32Value = 0}}}; - Qnn_Tensor_t gather1_inputs[] = {*p_gather0_out, *p_gather1_index}; - Qnn_Tensor_t gather1_outputs[] = {*p_gather1_out}; - Qnn_OpConfig_t gather1_op = ggmlqnn_create_op_config("out_gather1", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_GATHER, - gather1_params, 1, gather1_inputs, 2, - gather1_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather1_op)); - - //step-5:matmul operation - Qnn_Param_t matmul_params[] = {{QNN_PARAMTYPE_SCALAR,"transpose_in1", .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; - Qnn_Tensor_t matmul_inputs[] = {*p_gather1_out, *p_tensor1}; - Qnn_Tensor_t matmul_outputs[] = {*p_tensor2_transpose}; - Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - matmul_params, 1, matmul_inputs, 2, - matmul_outputs, - 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - - //step-6:transpose operation - Qnn_Param_t transpose_params[] = { {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; - Qnn_Tensor_t transpose_inputs[] = {*p_tensor2_transpose}; - Qnn_Tensor_t transpose_outputs[]= {*p_tensor2}; - Qnn_OpConfig_t transpose_op = ggmlqnn_create_op_config("transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t *p_tensor0 = nullptr; // src0 input + Qnn_Tensor_t *p_gather_out = nullptr; // After Gather + Qnn_Tensor_t *p_gather_indices = nullptr; // Gather indices + Qnn_Tensor_t *p_tensor1 = nullptr; // src1 input + Qnn_Tensor_t *p_matmul_out = nullptr; // MatMul output + Qnn_Tensor_t *p_transpose_perm = nullptr; // Transpose permutation + Qnn_Tensor_t *p_tensor2 = nullptr; // Final output + + + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), NULL, + &graph_handle)); + + // Step 1: Define dimensions + uint32_t B = src0->ne[0]; // Batch dim + uint32_t M = src0->ne[1]; // Rows + uint32_t K0 = src0->ne[2] * src0->ne[3]; // K from src0 + uint32_t N1 = src1->ne[1]; // From src1 + uint32_t K1 = src1->ne[2]; // K from src1 + uint32_t N = src1->ne[3]; // Columns + + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // Matching batch + GGML_ASSERT(dst->ne[0] == M); // Rows match + + // src0: [B, M, K1, K2] + uint32_t src0_dims[] = {static_cast(src0->ne[0]), + static_cast(src0->ne[1]), + static_cast(src0->ne[2]), + static_cast(src0->ne[3])}; + p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src0_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + + // Gather: 
Reshape src0 to [M, B, K0] for MatMul + uint32_t gather_indices_data[] = {1, 0, 2, 3}; // Permute [B, M, K1, K2] -> [M, B, K1, K2] + uint32_t gather_indices_dims[] = {4}; + p_gather_indices = GQCGT(nullptr, "gather_indices", QNN_TENSOR_TYPE_STATIC, + QNN_DATATYPE_UINT_32, 1, + gather_indices_dims, gather_indices_data, + sizeof(gather_indices_data)); + CHECK_QNN_API(error, + qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather_indices)); + + uint32_t gather_out_dims[] = {M, B, static_cast(src0->ne[2]), + static_cast(src0->ne[3])}; + p_gather_out = GQCGT(nullptr, "gather_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, + 4, + gather_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather_out)); + + Qnn_Param_t gather_params[] = { + {QNN_PARAMTYPE_SCALAR, "axis", .scalarParam = { + QNN_DATATYPE_INT_32, .int32Value = 0}} + }; + Qnn_Tensor_t gather_inputs[] = {*p_tensor0, *p_gather_indices}; + Qnn_Tensor_t gather_outputs[] = {*p_gather_out}; + Qnn_OpConfig_t gather_op = ggmlqnn_create_op_config("gather", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_GATHER, gather_params, 1, + gather_inputs, 2, gather_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather_op)); + + // src1: [B, N1, K, N] + uint32_t src1_dims[] = {static_cast(src1->ne[0]), + static_cast(src1->ne[1]), + static_cast(src1->ne[2]), + static_cast(src1->ne[3])}; + p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src1_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + + // MatMul: [M, B, K0] x [B, N1, K1, N] -> [M, N1, K1, N] + // Flatten for QNN: [M, B * K0] x [B * K1, N] + uint32_t matmul_in0_dims[] = {M, B * K0}; + Qnn_Tensor_t matmul_in0 = *p_gather_out; + QNN_VER_PTR(matmul_in0)->dimensions = matmul_in0_dims; + QNN_VER_PTR(matmul_in0)->rank = 2; + + uint32_t matmul_in1_dims[] = {B * K1, N}; + Qnn_Tensor_t matmul_in1 = *p_tensor1; + QNN_VER_PTR(matmul_in1)->dimensions = matmul_in1_dims; + QNN_VER_PTR(matmul_in1)->rank = 2; + + uint32_t matmul_out_dims[] = {M, N}; + p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, + 2, + matmul_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); + + Qnn_Tensor_t matmul_inputs[] = {matmul_in0, matmul_in1}; + Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; + Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, nullptr, 0, + matmul_inputs, 2, matmul_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); + + // Transpose: Restore to [M, N1, K, N] + uint32_t perm_data[] = {0, 1, 2, 3}; // Adjust based on dst + uint32_t perm_dims[] = {4}; + p_transpose_perm = GQCGT(nullptr, "transpose_perm", QNN_TENSOR_TYPE_STATIC, + QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); + CHECK_QNN_API(error, + qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_transpose_perm)); + + uint32_t dst_dims[] = {static_cast(dst->ne[0]), static_cast(dst->ne[1]), + static_cast(dst->ne[2]), + static_cast(dst->ne[3])}; + p_tensor2 = GQCGT(dst, "transpose", + QNN_TENSOR_TYPE_NATIVE, + QNN_DATATYPE_FLOAT_32, 2, + dst_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + // Transpose operation + Qnn_Param_t transpose_params[] = { + {QNN_PARAMTYPE_TENSOR, 
"perm", .tensorParam = *p_transpose_perm}}; + Qnn_Tensor_t transpose_inputs[] = {*p_matmul_out}; + Qnn_Tensor_t transpose_outputs[] = {*p_tensor2}; + Qnn_OpConfig_t transpose_op = ggmlqnn_create_op_config("out_trans", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, transpose_params, 1, transpose_inputs, 1, transpose_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, transpose_op)); - - //step-7:finalize graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - - //step-8:execute graph - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors[] = {*p_tensor2}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, - 1, NULL, NULL)); - - qnn_tensors_t ggml_op_mulmat_tensors; - ggml_op_mulmat_tensors.reserve(9); - ggml_op_mulmat_tensors.push_back(p_tensor0); - ggml_op_mulmat_tensors.push_back(p_tensor1); - ggml_op_mulmat_tensors.push_back(p_tensor2); - ggml_op_mulmat_tensors.push_back(p_param_tensor); - ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); - ggml_op_mulmat_tensors.push_back(p_gather0_index); - ggml_op_mulmat_tensors.push_back(p_gather0_out); - ggml_op_mulmat_tensors.push_back(p_gather1_index); - ggml_op_mulmat_tensors.push_back(p_gather1_out); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, transpose_op)); - auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + // Finalize graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - // this is the second technical approach of "how to utilize the Hexagon NPU maximally" through QNN SDK - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - } + // Execute graph + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; - // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; - op_perf.info(); + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_tensor2}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, + output_tensors, 1, NULL, NULL)); } /* diff --git a/tests/ggml-qnn-ut.cpp b/tests/ggml-qnn-ut.cpp index ff0e96f2b00cb..1ab75526794e8 100644 --- a/tests/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn-ut.cpp @@ -439,10 +439,10 @@ int main(int argc, char * argv[]) { 
//src0 = ggml_new_tensor_3d(ctx, qtype, 128, 64, 8); //src1 = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 2, 8); //verify 4D matrix - src0 = ggml_new_tensor_4d(ctx, qtype, 256, 16, 3, 2); - src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 1, 6, 4); //src0 = ggml_new_tensor_4d(ctx, qtype, 256, 16, 3, 2); - //src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); + //src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 1, 6, 4); + src0 = ggml_new_tensor_4d(ctx, qtype, 256, 16, 3, 2); + src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); } ggml_set_input(src0); From 30c5719562be9eb8532f1427aa64b3c58178804d Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 13:19:49 +0800 Subject: [PATCH 096/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step2 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 214 ++++++++++++++++++++++++++++- 1 file changed, 213 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 2553ff78f9c7d..9f835dbb7f5e0 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -289,7 +289,7 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { * than ggml_qnn_mul_mat, so it's a standalone function. * it will be combined with ggml_qnn_mul_mat after bugfix */ -static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { +static void ggml_qnn_mul_mat_4d1(ggml_backend_qnn_context *ctx, ggml_tensor *op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); qnn_instance *instance = ctx->instance; @@ -445,6 +445,218 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) output_tensors, 1, NULL, NULL)); } +static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); + qnn_instance *instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const ggml_tensor *src0 = op->src[0]; // e.g., [256, 16, 3, 2] or [256,16, 3, 2] + const ggml_tensor *src1 = op->src[1]; // e.g., [256, 1, 6, 4] or [256, 16, 3, 2] + ggml_tensor *dst = op; // e.g., [16, 1, 6, 4] or [16, 16, 3, 2] + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); + op_perf.start(); + + std::string graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t *p_tensor0 = nullptr; // src0 input + Qnn_Tensor_t *p_gather0_out = nullptr; // After Gather on src0 + Qnn_Tensor_t *p_gather0_indices = nullptr; // Gather indices for src0 + Qnn_Tensor_t *p_tensor1 = nullptr; // src1 input + Qnn_Tensor_t *p_gather1_out = nullptr; // After Gather on src1 + Qnn_Tensor_t *p_gather1_indices = nullptr; // Gather indices for src1 + Qnn_Tensor_t *p_matmul_out = nullptr; // MatMul output + Qnn_Tensor_t *p_transpose_perm = nullptr; // Transpose permutation + Qnn_Tensor_t *p_tensor2 = nullptr; // Final output + + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t &graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t &tensors = std::get<1>(graph_item); + p_tensor0 = 
tensors[0]; + p_gather0_out = tensors[1]; + p_gather0_indices = tensors[2]; + p_tensor1 = tensors[3]; + p_gather1_out = tensors[4]; + p_gather1_indices = tensors[5]; + p_matmul_out = tensors[6]; + p_transpose_perm = tensors[7]; + p_tensor2 = tensors[8]; + } else { + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), NULL, &graph_handle)); + + // Step 1: Define dimensions (ne = logical order) + uint32_t B = src0->ne[0]; // Batch dim + uint32_t M = src0->ne[1]; // Rows + uint32_t K0 = src0->ne[2] * src0->ne[3]; // K from src0 (e.g., 3 * 2 = 6) + uint32_t N1 = src1->ne[1]; // From src1 + uint32_t K1 = src1->ne[2]; // K from src1 (e.g., 6 or 3) + uint32_t N = src1->ne[3]; // Columns + + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // Matching batch + GGML_ASSERT(dst->ne[0] == M); // Rows match + GGML_ASSERT(K0 == K1); // K must match for mul_mat + + // src0: [B, M, K1, K2] + uint32_t src0_dims[] = {B, M, static_cast(src0->ne[2]), static_cast(src0->ne[3])}; + p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src0_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + + // Gather on src0: [B, M, K1, K2] -> [M, B, K0] (collapse K1, K2) + uint32_t gather0_indices_data[] = {1, 0, 2, 3}; // [B, M, K1, K2] -> [M, B, K1, K2] + uint32_t gather0_indices_dims[] = {4}; + p_gather0_indices = GQCGT(nullptr, "gather0_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + gather0_indices_dims, gather0_indices_data, sizeof(gather0_indices_data)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_indices)); + + uint32_t gather0_out_dims[] = {M, B, static_cast(src0->ne[2]), static_cast(src0->ne[3])}; + p_gather0_out = GQCGT(nullptr, "gather0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + gather0_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_out)); + + Qnn_Param_t gather0_params[] = { + {QNN_PARAMTYPE_SCALAR, "axis", .scalarParam = {QNN_DATATYPE_INT_32, .int32Value = 0}} + }; + Qnn_Tensor_t gather0_inputs[] = {*p_tensor0, *p_gather0_indices}; + Qnn_Tensor_t gather0_outputs[] = {*p_gather0_out}; + Qnn_OpConfig_t gather0_op = ggmlqnn_create_op_config("gather0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_GATHER, gather0_params, 1, + gather0_inputs, 2, gather0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather0_op)); + + // src1: [B, N1, K, N] + uint32_t src1_dims[] = {B, N1, static_cast(src1->ne[2]), static_cast(src1->ne[3])}; + p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src1_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + + // Gather on src1: [B, N1, K, N] -> [N1, B, K, N] + uint32_t gather1_indices_data[] = {1, 0, 2, 3}; // [B, N1, K, N] -> [N1, B, K, N] + uint32_t gather1_indices_dims[] = {4}; + p_gather1_indices = GQCGT(nullptr, "gather1_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + gather1_indices_dims, gather1_indices_data, sizeof(gather1_indices_data)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_indices)); + + uint32_t gather1_out_dims[] = {N1, B, static_cast(src1->ne[2]), static_cast(src1->ne[3])}; + p_gather1_out = GQCGT(nullptr, "gather1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + gather1_out_dims, nullptr, 0); + 
CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_out)); + + Qnn_Param_t gather1_params[] = { + {QNN_PARAMTYPE_SCALAR, "axis", .scalarParam = {QNN_DATATYPE_INT_32, .int32Value = 0}} + }; + Qnn_Tensor_t gather1_inputs[] = {*p_tensor1, *p_gather1_indices}; + Qnn_Tensor_t gather1_outputs[] = {*p_gather1_out}; + Qnn_OpConfig_t gather1_op = ggmlqnn_create_op_config("gather1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_GATHER, gather1_params, 1, + gather1_inputs, 2, gather1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather1_op)); + + // MatMul: [M, B * K0] x [B * K1, N] + uint32_t matmul_in0_dims[] = {M, B * K0}; + Qnn_Tensor_t matmul_in0 = *p_gather0_out; + QNN_VER_PTR(matmul_in0)->dimensions = matmul_in0_dims; + QNN_VER_PTR(matmul_in0)->rank = 2; + + uint32_t matmul_in1_dims[] = {B * K1, N}; + Qnn_Tensor_t matmul_in1 = *p_gather1_out; + QNN_VER_PTR(matmul_in1)->dimensions = matmul_in1_dims; + QNN_VER_PTR(matmul_in1)->rank = 2; + + uint32_t matmul_out_dims[] = {M, N}; + p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 2, + matmul_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); + + Qnn_Tensor_t matmul_inputs[] = {matmul_in0, matmul_in1}; + Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; + Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, nullptr, 0, + matmul_inputs, 2, matmul_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); + + // Transpose: [M, N] -> [M, N1, K1, N] + uint32_t perm_data[] = {0, 1, 2, 3}; // Placeholder, adjust below + if (dst->ne[1] == N1 && dst->ne[2] == K1 && dst->ne[3] == N) { + perm_data[0] = 0; perm_data[1] = 1; perm_data[2] = 2; perm_data[3] = 3; + } else if (dst->ne[1] == 1 && dst->ne[2] == K1 && dst->ne[3] == N) { + perm_data[0] = 0; perm_data[1] = 2; perm_data[2] = 1; perm_data[3] = 3; // Adjust for [M, 1, K, N] + } + uint32_t perm_dims[] = {4}; + p_transpose_perm = GQCGT(nullptr, "transpose_perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_transpose_perm)); + + uint32_t dst_dims[] = {M, N1, K1, N}; + p_tensor2 = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, + dst_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + + Qnn_Param_t transpose_params[] = { + {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_transpose_perm} + }; + Qnn_Tensor_t transpose_inputs[] = {*p_matmul_out}; + Qnn_Tensor_t transpose_outputs[] = {*p_tensor2}; + Qnn_OpConfig_t transpose_op = ggmlqnn_create_op_config("transpose", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, transpose_params, 1, + transpose_inputs, 1, transpose_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, transpose_op)); + + // Finalize + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + + // Cache + qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_gather0_out, p_gather0_indices, p_tensor1, + p_gather1_out, p_gather1_indices, p_matmul_out, + p_transpose_perm, p_tensor2}; + instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + } + + // Save dimensions + uint32_t *tensor_0_dims = QNN_VER_PTR(*p_tensor0)->dimensions; + uint32_t 
*gather0_out_dims = QNN_VER_PTR(*p_gather0_out)->dimensions; + uint32_t *gather0_indices_dims = QNN_VER_PTR(*p_gather0_indices)->dimensions; + uint32_t *tensor_1_dims = QNN_VER_PTR(*p_tensor1)->dimensions; + uint32_t *gather1_out_dims = QNN_VER_PTR(*p_gather1_out)->dimensions; + uint32_t *gather1_indices_dims = QNN_VER_PTR(*p_gather1_indices)->dimensions; + uint32_t *matmul_out_dims = QNN_VER_PTR(*p_matmul_out)->dimensions; + uint32_t *transpose_perm_dims = QNN_VER_PTR(*p_transpose_perm)->dimensions; + uint32_t *tensor_2_dims = QNN_VER_PTR(*p_tensor2)->dimensions; + + // Execute + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; + + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_tensor2}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, + output_tensors, 1, NULL, NULL)); + + // Restore dimensions + QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dims; + QNN_VER_PTR(*p_gather0_out)->dimensions = gather0_out_dims; + QNN_VER_PTR(*p_gather0_indices)->dimensions = gather0_indices_dims; + QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dims; + QNN_VER_PTR(*p_gather1_out)->dimensions = gather1_out_dims; + QNN_VER_PTR(*p_gather1_indices)->dimensions = gather1_indices_dims; + QNN_VER_PTR(*p_matmul_out)->dimensions = matmul_out_dims; + QNN_VER_PTR(*p_transpose_perm)->dimensions = transpose_perm_dims; + QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dims; + + op_perf.info(); +} + /* * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs * using the QNN backend. this function performs matrix multiplication of the input tensor From 30909ddd7ac32bd75b020686204f69c12b520f20 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 13:30:22 +0800 Subject: [PATCH 097/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step3 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 231 +++++------------------------ 1 file changed, 35 insertions(+), 196 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 9f835dbb7f5e0..db5847a78ed02 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -289,162 +289,6 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { * than ggml_qnn_mul_mat, so it's a standalone function. 
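 * (worked shape example taken from the 4D unit test in tests/ggml-qnn-ut.cpp: with both
 *  src0 and src1 created as ggml_new_tensor_4d(ctx, qtype/GGML_TYPE_F32, 256, 16, 3, 2),
 *  ggml's mul_mat produces dst with ne = [16, 16, 3, 2], i.e. 3*2 = 6 independent 16x16
 *  results, each element being the dot product of one ne[0] = 256 run of src0 with one
 *  ne[0] = 256 run of src1)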
* it will be combined with ggml_qnn_mul_mat after bugfix */ -static void ggml_qnn_mul_mat_4d1(ggml_backend_qnn_context *ctx, ggml_tensor *op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); - qnn_instance *instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - const ggml_tensor *src0 = op->src[0]; // e.g., [256, 16, 3, 2] or [256,16,3,2] - const ggml_tensor *src1 = op->src[1]; // e.g., [256, 1, 6, 4] or [256,16,3, 2] - ggml_tensor *dst = op; // e.g., [16, 1, 6, 4] or [16,16,3, 2] - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); - op_perf.start(); - - std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); - GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t *p_tensor0 = nullptr; // src0 input - Qnn_Tensor_t *p_gather_out = nullptr; // After Gather - Qnn_Tensor_t *p_gather_indices = nullptr; // Gather indices - Qnn_Tensor_t *p_tensor1 = nullptr; // src1 input - Qnn_Tensor_t *p_matmul_out = nullptr; // MatMul output - Qnn_Tensor_t *p_transpose_perm = nullptr; // Transpose permutation - Qnn_Tensor_t *p_tensor2 = nullptr; // Final output - - - CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), NULL, - &graph_handle)); - - // Step 1: Define dimensions - uint32_t B = src0->ne[0]; // Batch dim - uint32_t M = src0->ne[1]; // Rows - uint32_t K0 = src0->ne[2] * src0->ne[3]; // K from src0 - uint32_t N1 = src1->ne[1]; // From src1 - uint32_t K1 = src1->ne[2]; // K from src1 - uint32_t N = src1->ne[3]; // Columns - - GGML_ASSERT(src0->ne[0] == src1->ne[0]); // Matching batch - GGML_ASSERT(dst->ne[0] == M); // Rows match - - // src0: [B, M, K1, K2] - uint32_t src0_dims[] = {static_cast(src0->ne[0]), - static_cast(src0->ne[1]), - static_cast(src0->ne[2]), - static_cast(src0->ne[3])}; - p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src0_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - - // Gather: Reshape src0 to [M, B, K0] for MatMul - uint32_t gather_indices_data[] = {1, 0, 2, 3}; // Permute [B, M, K1, K2] -> [M, B, K1, K2] - uint32_t gather_indices_dims[] = {4}; - p_gather_indices = GQCGT(nullptr, "gather_indices", QNN_TENSOR_TYPE_STATIC, - QNN_DATATYPE_UINT_32, 1, - gather_indices_dims, gather_indices_data, - sizeof(gather_indices_data)); - CHECK_QNN_API(error, - qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather_indices)); - - uint32_t gather_out_dims[] = {M, B, static_cast(src0->ne[2]), - static_cast(src0->ne[3])}; - p_gather_out = GQCGT(nullptr, "gather_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, - 4, - gather_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather_out)); - - Qnn_Param_t gather_params[] = { - {QNN_PARAMTYPE_SCALAR, "axis", .scalarParam = { - QNN_DATATYPE_INT_32, .int32Value = 0}} - }; - Qnn_Tensor_t gather_inputs[] = {*p_tensor0, *p_gather_indices}; - Qnn_Tensor_t gather_outputs[] = {*p_gather_out}; - Qnn_OpConfig_t gather_op = ggmlqnn_create_op_config("gather", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_GATHER, gather_params, 1, - gather_inputs, 2, gather_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather_op)); - - 
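The gather and permute juggling above, and in the steps that follow, comes down to one convention mismatch: ggml orders ne[] innermost-first, so ne[0] is the contiguous K axis, while QNN lists tensor dimensions outermost-first. Below is a minimal sketch of that reversal, using a hypothetical helper rather than the GQCGT call the patches rely on; it is illustrative only and not part of any patch.

    #include <cstdint>
    #include "ggml.h"

    // ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2) has ne = {256, 16, 3, 2};
    // the same tensor described outermost-first for QNN is {2, 3, 16, 256}.
    static void ggml_ne_to_qnn_dims(const struct ggml_tensor * t, uint32_t dims[4]) {
        for (int i = 0; i < 4; ++i) {
            dims[i] = (uint32_t) t->ne[3 - i];
        }
    }

Whether this reversal is done by the caller or left to GQCGT is exactly what the repeated src0_dims/src1_dims changes in steps 3 through 8 keep adjusting.
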
// src1: [B, N1, K, N] - uint32_t src1_dims[] = {static_cast(src1->ne[0]), - static_cast(src1->ne[1]), - static_cast(src1->ne[2]), - static_cast(src1->ne[3])}; - p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src1_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - - // MatMul: [M, B, K0] x [B, N1, K1, N] -> [M, N1, K1, N] - // Flatten for QNN: [M, B * K0] x [B * K1, N] - uint32_t matmul_in0_dims[] = {M, B * K0}; - Qnn_Tensor_t matmul_in0 = *p_gather_out; - QNN_VER_PTR(matmul_in0)->dimensions = matmul_in0_dims; - QNN_VER_PTR(matmul_in0)->rank = 2; - - uint32_t matmul_in1_dims[] = {B * K1, N}; - Qnn_Tensor_t matmul_in1 = *p_tensor1; - QNN_VER_PTR(matmul_in1)->dimensions = matmul_in1_dims; - QNN_VER_PTR(matmul_in1)->rank = 2; - - uint32_t matmul_out_dims[] = {M, N}; - p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, - 2, - matmul_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); - - Qnn_Tensor_t matmul_inputs[] = {matmul_in0, matmul_in1}; - Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; - Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, nullptr, 0, - matmul_inputs, 2, matmul_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - - // Transpose: Restore to [M, N1, K, N] - uint32_t perm_data[] = {0, 1, 2, 3}; // Adjust based on dst - uint32_t perm_dims[] = {4}; - p_transpose_perm = GQCGT(nullptr, "transpose_perm", QNN_TENSOR_TYPE_STATIC, - QNN_DATATYPE_UINT_32, 1, - perm_dims, perm_data, sizeof(perm_data)); - CHECK_QNN_API(error, - qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_transpose_perm)); - - uint32_t dst_dims[] = {static_cast(dst->ne[0]), static_cast(dst->ne[1]), - static_cast(dst->ne[2]), - static_cast(dst->ne[3])}; - p_tensor2 = GQCGT(dst, "transpose", - QNN_TENSOR_TYPE_NATIVE, - QNN_DATATYPE_FLOAT_32, 2, - dst_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - // Transpose operation - Qnn_Param_t transpose_params[] = { - {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_transpose_perm}}; - Qnn_Tensor_t transpose_inputs[] = {*p_matmul_out}; - Qnn_Tensor_t transpose_outputs[] = {*p_tensor2}; - Qnn_OpConfig_t transpose_op = ggmlqnn_create_op_config("out_trans", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, - transpose_params, 1, transpose_inputs, 1, - transpose_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, transpose_op)); - - // Finalize graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - - // Execute graph - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; - - Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors[] = {*p_tensor2}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, - output_tensors, 1, NULL, NULL)); -} - static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; @@ -452,9 +296,9 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor 
*op) qnn_instance *instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - const ggml_tensor *src0 = op->src[0]; // e.g., [256, 16, 3, 2] or [256,16, 3, 2] - const ggml_tensor *src1 = op->src[1]; // e.g., [256, 1, 6, 4] or [256, 16, 3, 2] - ggml_tensor *dst = op; // e.g., [16, 1, 6, 4] or [16, 16, 3, 2] + const ggml_tensor *src0 = op->src[0]; + const ggml_tensor *src1 = op->src[1]; + ggml_tensor *dst = op; GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); @@ -466,15 +310,15 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t *p_tensor0 = nullptr; // src0 input - Qnn_Tensor_t *p_gather0_out = nullptr; // After Gather on src0 - Qnn_Tensor_t *p_gather0_indices = nullptr; // Gather indices for src0 - Qnn_Tensor_t *p_tensor1 = nullptr; // src1 input - Qnn_Tensor_t *p_gather1_out = nullptr; // After Gather on src1 - Qnn_Tensor_t *p_gather1_indices = nullptr; // Gather indices for src1 - Qnn_Tensor_t *p_matmul_out = nullptr; // MatMul output - Qnn_Tensor_t *p_transpose_perm = nullptr; // Transpose permutation - Qnn_Tensor_t *p_tensor2 = nullptr; // Final output + Qnn_Tensor_t *p_tensor0 = nullptr; + Qnn_Tensor_t *p_gather0_out = nullptr; + Qnn_Tensor_t *p_gather0_indices = nullptr; + Qnn_Tensor_t *p_tensor1 = nullptr; + Qnn_Tensor_t *p_gather1_out = nullptr; + Qnn_Tensor_t *p_gather1_indices = nullptr; + Qnn_Tensor_t *p_matmul_out = nullptr; + Qnn_Tensor_t *p_transpose_perm = nullptr; + Qnn_Tensor_t *p_tensor2 = nullptr; if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { graph_initialized = true; @@ -494,32 +338,32 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); - // Step 1: Define dimensions (ne = logical order) - uint32_t B = src0->ne[0]; // Batch dim - uint32_t M = src0->ne[1]; // Rows - uint32_t K0 = src0->ne[2] * src0->ne[3]; // K from src0 (e.g., 3 * 2 = 6) - uint32_t N1 = src1->ne[1]; // From src1 - uint32_t K1 = src1->ne[2]; // K from src1 (e.g., 6 or 3) - uint32_t N = src1->ne[3]; // Columns + // Step 1: Define dimensions (ne = [K2, K1, M, B] for src0, [N, K, N1, B] for src1) + uint32_t B = src0->ne[3]; + uint32_t M = src0->ne[2]; + uint32_t K0 = src0->ne[0] * src0->ne[1]; // K from src0 + uint32_t N1 = src1->ne[2]; + uint32_t K1 = src1->ne[1]; + uint32_t N = src1->ne[0]; - GGML_ASSERT(src0->ne[0] == src1->ne[0]); // Matching batch - GGML_ASSERT(dst->ne[0] == M); // Rows match - GGML_ASSERT(K0 == K1); // K must match for mul_mat + GGML_ASSERT(src0->ne[3] == src1->ne[3]); // Matching batch + GGML_ASSERT(dst->ne[2] == M); // M matches dst + GGML_ASSERT(K0 == K1); // K must match - // src0: [B, M, K1, K2] - uint32_t src0_dims[] = {B, M, static_cast(src0->ne[2]), static_cast(src0->ne[3])}; + // src0: [K2, K1, M, B] -> [B, M, K2, K1] + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[0]), static_cast(src0->ne[1])}; p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src0_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - // Gather on src0: [B, M, K1, K2] -> [M, B, K0] (collapse K1, K2) - uint32_t gather0_indices_data[] = 
{1, 0, 2, 3}; // [B, M, K1, K2] -> [M, B, K1, K2] + // Gather on src0: [B, M, K2, K1] -> [M, B, K0] + uint32_t gather0_indices_data[] = {2, 3, 0, 1}; // [K2, K1, M, B] -> [M, B, K2, K1] uint32_t gather0_indices_dims[] = {4}; p_gather0_indices = GQCGT(nullptr, "gather0_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, gather0_indices_dims, gather0_indices_data, sizeof(gather0_indices_data)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_indices)); - uint32_t gather0_out_dims[] = {M, B, static_cast(src0->ne[2]), static_cast(src0->ne[3])}; + uint32_t gather0_out_dims[] = {M, B, static_cast(src0->ne[0]), static_cast(src0->ne[1])}; p_gather0_out = GQCGT(nullptr, "gather0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, gather0_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_out)); @@ -534,20 +378,20 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) gather0_inputs, 2, gather0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather0_op)); - // src1: [B, N1, K, N] - uint32_t src1_dims[] = {B, N1, static_cast(src1->ne[2]), static_cast(src1->ne[3])}; + // src1: [N, K, N1, B] + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src1_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - // Gather on src1: [B, N1, K, N] -> [N1, B, K, N] - uint32_t gather1_indices_data[] = {1, 0, 2, 3}; // [B, N1, K, N] -> [N1, B, K, N] + // Gather on src1: [N, K, N1, B] -> [N1, B, K, N] + uint32_t gather1_indices_data[] = {2, 3, 1, 0}; // [N, K, N1, B] -> [N1, B, K, N] uint32_t gather1_indices_dims[] = {4}; p_gather1_indices = GQCGT(nullptr, "gather1_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, gather1_indices_dims, gather1_indices_data, sizeof(gather1_indices_data)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_indices)); - uint32_t gather1_out_dims[] = {N1, B, static_cast(src1->ne[2]), static_cast(src1->ne[3])}; + uint32_t gather1_out_dims[] = {N1, B, static_cast(src1->ne[1]), static_cast(src1->ne[0])}; p_gather1_out = GQCGT(nullptr, "gather1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, gather1_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_out)); @@ -585,19 +429,14 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - // Transpose: [M, N] -> [M, N1, K1, N] - uint32_t perm_data[] = {0, 1, 2, 3}; // Placeholder, adjust below - if (dst->ne[1] == N1 && dst->ne[2] == K1 && dst->ne[3] == N) { - perm_data[0] = 0; perm_data[1] = 1; perm_data[2] = 2; perm_data[3] = 3; - } else if (dst->ne[1] == 1 && dst->ne[2] == K1 && dst->ne[3] == N) { - perm_data[0] = 0; perm_data[1] = 2; perm_data[2] = 1; perm_data[3] = 3; // Adjust for [M, 1, K, N] - } + // Transpose: [M, N] -> [M, N1, K, N] + uint32_t perm_data[] = {0, 2, 1, 3}; // [M, N] -> [M, N1, K, N] based on dst uint32_t perm_dims[] = {4}; p_transpose_perm = GQCGT(nullptr, "transpose_perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, perm_dims, perm_data, sizeof(perm_data)); CHECK_QNN_API(error, 
qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_transpose_perm)); - uint32_t dst_dims[] = {M, N1, K1, N}; + uint32_t dst_dims[] = {static_cast(dst->ne[2]), static_cast(dst->ne[3]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; // Match dst->ne order p_tensor2 = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, dst_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); From 9ed3ecde18ef06f2c7cd59a763f8edfdd5a079a3 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 13:41:44 +0800 Subject: [PATCH 098/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step4 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 32 +++++++++++++++--------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index db5847a78ed02..1092e2ffac811 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -296,9 +296,9 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) qnn_instance *instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - const ggml_tensor *src0 = op->src[0]; - const ggml_tensor *src1 = op->src[1]; - ggml_tensor *dst = op; + const ggml_tensor *src0 = op->src[0]; // e.g., ne = [3, 2, 16, 256] + const ggml_tensor *src1 = op->src[1]; // e.g., ne = [4, 6, 1, 256] or [2, 3, 16, 256] + ggml_tensor *dst = op; // e.g., ne = [4, 6, 1, 16] or [2, 3, 16, 16] GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); @@ -341,22 +341,22 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) // Step 1: Define dimensions (ne = [K2, K1, M, B] for src0, [N, K, N1, B] for src1) uint32_t B = src0->ne[3]; uint32_t M = src0->ne[2]; - uint32_t K0 = src0->ne[0] * src0->ne[1]; // K from src0 + uint32_t K0 = src0->ne[0] * src0->ne[1]; // K from src0 (e.g., 3 * 2 = 6) uint32_t N1 = src1->ne[2]; - uint32_t K1 = src1->ne[1]; - uint32_t N = src1->ne[0]; + uint32_t K1 = src1->ne[1] * src1->ne[0]; // K from src1 (e.g., 6 or 3 * 2 = 6) + uint32_t N = src1->ne[3]; GGML_ASSERT(src0->ne[3] == src1->ne[3]); // Matching batch - GGML_ASSERT(dst->ne[2] == M); // M matches dst - GGML_ASSERT(K0 == K1); // K must match + GGML_ASSERT(dst->ne[2] == M); // M matches dst + GGML_ASSERT(K0 == K1); // K must match - // src0: [K2, K1, M, B] -> [B, M, K2, K1] - uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[0]), static_cast(src0->ne[1])}; + // src0: [K2, K1, M, B] -> [B, M, K0] (logical order for QNN) + uint32_t src0_dims[] = {B, M, static_cast(src0->ne[0]), static_cast(src0->ne[1])}; p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src0_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - // Gather on src0: [B, M, K2, K1] -> [M, B, K0] + // Gather on src0: [K2, K1, M, B] -> [M, B, K2, K1] uint32_t gather0_indices_data[] = {2, 3, 0, 1}; // [K2, K1, M, B] -> [M, B, K2, K1] uint32_t gather0_indices_dims[] = {4}; p_gather0_indices = GQCGT(nullptr, "gather0_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, @@ -378,8 +378,8 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) gather0_inputs, 2, gather0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather0_op)); - // src1: [N, K, N1, B] - 
uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; + // src1: [N, K, N1, B] -> [B, N1, K, N] (logical order for QNN) + uint32_t src1_dims[] = {B, N1, static_cast(src1->ne[1]), static_cast(src1->ne[0])}; p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src1_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); @@ -429,14 +429,14 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - // Transpose: [M, N] -> [M, N1, K, N] - uint32_t perm_data[] = {0, 2, 1, 3}; // [M, N] -> [M, N1, K, N] based on dst + // Transpose: [M, N] -> [N, K, N1, M] to match dst->ne + uint32_t perm_data[] = {3, 2, 1, 0}; // Adjust to dst->ne order uint32_t perm_dims[] = {4}; p_transpose_perm = GQCGT(nullptr, "transpose_perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, perm_dims, perm_data, sizeof(perm_data)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_transpose_perm)); - uint32_t dst_dims[] = {static_cast(dst->ne[2]), static_cast(dst->ne[3]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; // Match dst->ne order + uint32_t dst_dims[] = {static_cast(dst->ne[0]), static_cast(dst->ne[1]), static_cast(dst->ne[2]), static_cast(dst->ne[3])}; // Match dst->ne directly p_tensor2 = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, dst_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); From a72930ddc69f168e35f6160aca512fd6b8f6f7df Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 13:50:24 +0800 Subject: [PATCH 099/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step5 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 33 +++++++++++++++--------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 1092e2ffac811..72c241bb60d50 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -296,9 +296,9 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) qnn_instance *instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - const ggml_tensor *src0 = op->src[0]; // e.g., ne = [3, 2, 16, 256] - const ggml_tensor *src1 = op->src[1]; // e.g., ne = [4, 6, 1, 256] or [2, 3, 16, 256] - ggml_tensor *dst = op; // e.g., ne = [4, 6, 1, 16] or [2, 3, 16, 16] + const ggml_tensor *src0 = op->src[0]; + const ggml_tensor *src1 = op->src[1]; + ggml_tensor *dst = op; GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); @@ -307,7 +307,6 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) std::string graph_name; ggmlqnn_get_graphkey_from_op(op, graph_name); GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t *p_tensor0 = nullptr; @@ -320,6 +319,8 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) Qnn_Tensor_t *p_transpose_perm = nullptr; Qnn_Tensor_t *p_tensor2 = nullptr; + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); // Keep debug line + if 
(instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { graph_initialized = true; qnn_res_t &graph_item = instance->_qnn_graph_map[graph_name]; @@ -341,23 +342,23 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) // Step 1: Define dimensions (ne = [K2, K1, M, B] for src0, [N, K, N1, B] for src1) uint32_t B = src0->ne[3]; uint32_t M = src0->ne[2]; - uint32_t K0 = src0->ne[0] * src0->ne[1]; // K from src0 (e.g., 3 * 2 = 6) + uint32_t K0 = src0->ne[0] * src0->ne[1]; // K from src0 uint32_t N1 = src1->ne[2]; - uint32_t K1 = src1->ne[1] * src1->ne[0]; // K from src1 (e.g., 6 or 3 * 2 = 6) - uint32_t N = src1->ne[3]; + uint32_t K1 = src1->ne[1] * src1->ne[0]; // K from src1 + uint32_t N = src1->ne[0]; GGML_ASSERT(src0->ne[3] == src1->ne[3]); // Matching batch GGML_ASSERT(dst->ne[2] == M); // M matches dst GGML_ASSERT(K0 == K1); // K must match - // src0: [K2, K1, M, B] -> [B, M, K0] (logical order for QNN) - uint32_t src0_dims[] = {B, M, static_cast(src0->ne[0]), static_cast(src0->ne[1])}; + // src0: Use GGML's ne directly, let GQCGT reverse to QNN order + uint32_t src0_dims[] = {static_cast(src0->ne[0]), static_cast(src0->ne[1]), static_cast(src0->ne[2]), static_cast(src0->ne[3])}; p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src0_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); // Gather on src0: [K2, K1, M, B] -> [M, B, K2, K1] - uint32_t gather0_indices_data[] = {2, 3, 0, 1}; // [K2, K1, M, B] -> [M, B, K2, K1] + uint32_t gather0_indices_data[] = {2, 3, 0, 1}; // Correct for QNN's reversed order uint32_t gather0_indices_dims[] = {4}; p_gather0_indices = GQCGT(nullptr, "gather0_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, gather0_indices_dims, gather0_indices_data, sizeof(gather0_indices_data)); @@ -378,14 +379,14 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) gather0_inputs, 2, gather0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather0_op)); - // src1: [N, K, N1, B] -> [B, N1, K, N] (logical order for QNN) - uint32_t src1_dims[] = {B, N1, static_cast(src1->ne[1]), static_cast(src1->ne[0])}; + // src1: Use GGML's ne directly + uint32_t src1_dims[] = {static_cast(src1->ne[0]), static_cast(src1->ne[1]), static_cast(src1->ne[2]), static_cast(src1->ne[3])}; p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src1_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); // Gather on src1: [N, K, N1, B] -> [N1, B, K, N] - uint32_t gather1_indices_data[] = {2, 3, 1, 0}; // [N, K, N1, B] -> [N1, B, K, N] + uint32_t gather1_indices_data[] = {2, 3, 1, 0}; // Correct for QNN's reversed order uint32_t gather1_indices_dims[] = {4}; p_gather1_indices = GQCGT(nullptr, "gather1_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, gather1_indices_dims, gather1_indices_data, sizeof(gather1_indices_data)); @@ -429,14 +430,14 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - // Transpose: [M, N] -> [N, K, N1, M] to match dst->ne - uint32_t perm_data[] = {3, 2, 1, 0}; // Adjust to dst->ne order + // Transpose: [M, N] -> Match dst->ne + uint32_t perm_data[] = {3, 2, 1, 0}; // [M, N] -> [N, K, N1, M] for dst->ne uint32_t perm_dims[] = 
{4}; p_transpose_perm = GQCGT(nullptr, "transpose_perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, perm_dims, perm_data, sizeof(perm_data)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_transpose_perm)); - uint32_t dst_dims[] = {static_cast(dst->ne[0]), static_cast(dst->ne[1]), static_cast(dst->ne[2]), static_cast(dst->ne[3])}; // Match dst->ne directly + uint32_t dst_dims[] = {static_cast(dst->ne[0]), static_cast(dst->ne[1]), static_cast(dst->ne[2]), static_cast(dst->ne[3])}; p_tensor2 = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, dst_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); From eab76dd65643313f7bff12f57ca9c454500e7d88 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 14:00:46 +0800 Subject: [PATCH 100/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step6 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 72c241bb60d50..55b941c4bcec9 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -342,29 +342,29 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) // Step 1: Define dimensions (ne = [K2, K1, M, B] for src0, [N, K, N1, B] for src1) uint32_t B = src0->ne[3]; uint32_t M = src0->ne[2]; - uint32_t K0 = src0->ne[0] * src0->ne[1]; // K from src0 + uint32_t K0 = src0->ne[0] * src0->ne[1]; uint32_t N1 = src1->ne[2]; - uint32_t K1 = src1->ne[1] * src1->ne[0]; // K from src1 + uint32_t K1 = src1->ne[1] * src1->ne[0]; uint32_t N = src1->ne[0]; GGML_ASSERT(src0->ne[3] == src1->ne[3]); // Matching batch GGML_ASSERT(dst->ne[2] == M); // M matches dst GGML_ASSERT(K0 == K1); // K must match - // src0: Use GGML's ne directly, let GQCGT reverse to QNN order - uint32_t src0_dims[] = {static_cast(src0->ne[0]), static_cast(src0->ne[1]), static_cast(src0->ne[2]), static_cast(src0->ne[3])}; + // src0: [K2, K1, M, B] -> QNN sees [B, M, K1, K2] after GQCGT reversal + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src0_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - // Gather on src0: [K2, K1, M, B] -> [M, B, K2, K1] - uint32_t gather0_indices_data[] = {2, 3, 0, 1}; // Correct for QNN's reversed order + // Gather on src0: [B, M, K1, K2] -> [M, B, K1, K2] + uint32_t gather0_indices_data[] = {1, 0, 2, 3}; // [B, M, K1, K2] -> [M, B, K1, K2] uint32_t gather0_indices_dims[] = {4}; p_gather0_indices = GQCGT(nullptr, "gather0_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, gather0_indices_dims, gather0_indices_data, sizeof(gather0_indices_data)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_indices)); - uint32_t gather0_out_dims[] = {M, B, static_cast(src0->ne[0]), static_cast(src0->ne[1])}; + uint32_t gather0_out_dims[] = {M, B, static_cast(src0->ne[1]), static_cast(src0->ne[0])}; p_gather0_out = GQCGT(nullptr, "gather0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, gather0_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_out)); @@ -379,14 +379,14 @@ static void 
ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) gather0_inputs, 2, gather0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather0_op)); - // src1: Use GGML's ne directly - uint32_t src1_dims[] = {static_cast(src1->ne[0]), static_cast(src1->ne[1]), static_cast(src1->ne[2]), static_cast(src1->ne[3])}; + // src1: [N, K, N1, B] -> QNN sees [B, N1, K, N] + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src1_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - // Gather on src1: [N, K, N1, B] -> [N1, B, K, N] - uint32_t gather1_indices_data[] = {2, 3, 1, 0}; // Correct for QNN's reversed order + // Gather on src1: [B, N1, K, N] -> [N1, B, K, N] + uint32_t gather1_indices_data[] = {1, 0, 2, 3}; // [B, N1, K, N] -> [N1, B, K, N] uint32_t gather1_indices_dims[] = {4}; p_gather1_indices = GQCGT(nullptr, "gather1_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, gather1_indices_dims, gather1_indices_data, sizeof(gather1_indices_data)); @@ -430,9 +430,9 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - // Transpose: [M, N] -> Match dst->ne - uint32_t perm_data[] = {3, 2, 1, 0}; // [M, N] -> [N, K, N1, M] for dst->ne - uint32_t perm_dims[] = {4}; + // Transpose: [M, N] -> Match dst->ne ([N, K, N1, M] reversed) + uint32_t perm_data[] = {1, 0}; // [M, N] -> [N, M] for 2D + uint32_t perm_dims[] = {2}; p_transpose_perm = GQCGT(nullptr, "transpose_perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, perm_dims, perm_data, sizeof(perm_data)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_transpose_perm)); From fe8bd7d8d2d373e9d191a06de87b77e73231dfe7 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 14:12:24 +0800 Subject: [PATCH 101/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step7 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 46 +++++++++--------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 55b941c4bcec9..c5f4cb9008270 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -316,7 +316,6 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) Qnn_Tensor_t *p_gather1_out = nullptr; Qnn_Tensor_t *p_gather1_indices = nullptr; Qnn_Tensor_t *p_matmul_out = nullptr; - Qnn_Tensor_t *p_transpose_perm = nullptr; Qnn_Tensor_t *p_tensor2 = nullptr; ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); // Keep debug line @@ -333,13 +332,12 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) p_gather1_out = tensors[4]; p_gather1_indices = tensors[5]; p_matmul_out = tensors[6]; - p_transpose_perm = tensors[7]; - p_tensor2 = tensors[8]; + p_tensor2 = tensors[7]; } else { CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); - // Step 1: Define dimensions (ne = [K2, K1, M, B] for src0, [N, K, N1, B] for src1) + // Step 1: Define dimensions uint32_t B = src0->ne[3]; uint32_t M = src0->ne[2]; uint32_t K0 = src0->ne[0] * src0->ne[1]; @@ -351,7 +349,7 @@ 
static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) GGML_ASSERT(dst->ne[2] == M); // M matches dst GGML_ASSERT(K0 == K1); // K must match - // src0: [K2, K1, M, B] -> QNN sees [B, M, K1, K2] after GQCGT reversal + // src0: [K2, K1, M, B] -> QNN: [B, M, K1, K2] uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src0_dims, nullptr, 0); @@ -379,7 +377,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) gather0_inputs, 2, gather0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather0_op)); - // src1: [N, K, N1, B] -> QNN sees [B, N1, K, N] + // src1: [N, K, N1, B] -> QNN: [B, N1, K, N] uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src1_dims, nullptr, 0); @@ -407,19 +405,19 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) gather1_inputs, 2, gather1_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather1_op)); - // MatMul: [M, B * K0] x [B * K1, N] - uint32_t matmul_in0_dims[] = {M, B * K0}; + // MatMul: [M, B, K0] x [N1, B, K1] -> [M, N1, N] + uint32_t matmul_in0_dims[] = {M, B, K0}; Qnn_Tensor_t matmul_in0 = *p_gather0_out; QNN_VER_PTR(matmul_in0)->dimensions = matmul_in0_dims; - QNN_VER_PTR(matmul_in0)->rank = 2; + QNN_VER_PTR(matmul_in0)->rank = 3; - uint32_t matmul_in1_dims[] = {B * K1, N}; + uint32_t matmul_in1_dims[] = {N1, B, K1}; Qnn_Tensor_t matmul_in1 = *p_gather1_out; QNN_VER_PTR(matmul_in1)->dimensions = matmul_in1_dims; - QNN_VER_PTR(matmul_in1)->rank = 2; + QNN_VER_PTR(matmul_in1)->rank = 3; - uint32_t matmul_out_dims[] = {M, N}; - p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 2, + uint32_t matmul_out_dims[] = {M, N1, N}; + p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, matmul_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); @@ -430,35 +428,19 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - // Transpose: [M, N] -> Match dst->ne ([N, K, N1, M] reversed) - uint32_t perm_data[] = {1, 0}; // [M, N] -> [N, M] for 2D - uint32_t perm_dims[] = {2}; - p_transpose_perm = GQCGT(nullptr, "transpose_perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - perm_dims, perm_data, sizeof(perm_data)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_transpose_perm)); - + // Output: Match dst->ne directly uint32_t dst_dims[] = {static_cast(dst->ne[0]), static_cast(dst->ne[1]), static_cast(dst->ne[2]), static_cast(dst->ne[3])}; p_tensor2 = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, dst_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - Qnn_Param_t transpose_params[] = { - {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_transpose_perm} - }; - Qnn_Tensor_t transpose_inputs[] = {*p_matmul_out}; - Qnn_Tensor_t transpose_outputs[] = {*p_tensor2}; - Qnn_OpConfig_t transpose_op = 
ggmlqnn_create_op_config("transpose", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, transpose_params, 1, - transpose_inputs, 1, transpose_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, transpose_op)); - // Finalize CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); // Cache qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_gather0_out, p_gather0_indices, p_tensor1, p_gather1_out, p_gather1_indices, p_matmul_out, - p_transpose_perm, p_tensor2}; + p_tensor2}; instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); } @@ -470,7 +452,6 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) uint32_t *gather1_out_dims = QNN_VER_PTR(*p_gather1_out)->dimensions; uint32_t *gather1_indices_dims = QNN_VER_PTR(*p_gather1_indices)->dimensions; uint32_t *matmul_out_dims = QNN_VER_PTR(*p_matmul_out)->dimensions; - uint32_t *transpose_perm_dims = QNN_VER_PTR(*p_transpose_perm)->dimensions; uint32_t *tensor_2_dims = QNN_VER_PTR(*p_tensor2)->dimensions; // Execute @@ -491,7 +472,6 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) QNN_VER_PTR(*p_gather1_out)->dimensions = gather1_out_dims; QNN_VER_PTR(*p_gather1_indices)->dimensions = gather1_indices_dims; QNN_VER_PTR(*p_matmul_out)->dimensions = matmul_out_dims; - QNN_VER_PTR(*p_transpose_perm)->dimensions = transpose_perm_dims; QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dims; op_perf.info(); From 1b92408b403880a7a3ce07f6e7673e6b6089c99a Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 14:17:40 +0800 Subject: [PATCH 102/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step8 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 96 +++++++----------------------- 1 file changed, 20 insertions(+), 76 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index c5f4cb9008270..5b2552e45868b 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -310,11 +310,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t *p_tensor0 = nullptr; - Qnn_Tensor_t *p_gather0_out = nullptr; - Qnn_Tensor_t *p_gather0_indices = nullptr; Qnn_Tensor_t *p_tensor1 = nullptr; - Qnn_Tensor_t *p_gather1_out = nullptr; - Qnn_Tensor_t *p_gather1_indices = nullptr; Qnn_Tensor_t *p_matmul_out = nullptr; Qnn_Tensor_t *p_tensor2 = nullptr; @@ -326,18 +322,14 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) graph_handle = std::get<0>(graph_item); qnn_tensors_t &tensors = std::get<1>(graph_item); p_tensor0 = tensors[0]; - p_gather0_out = tensors[1]; - p_gather0_indices = tensors[2]; - p_tensor1 = tensors[3]; - p_gather1_out = tensors[4]; - p_gather1_indices = tensors[5]; - p_matmul_out = tensors[6]; - p_tensor2 = tensors[7]; + p_tensor1 = tensors[1]; + p_matmul_out = tensors[2]; + p_tensor2 = tensors[3]; } else { CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); - // Step 1: Define dimensions + // Define dimensions uint32_t B = src0->ne[3]; uint32_t M = src0->ne[2]; uint32_t K0 = src0->ne[0] * src0->ne[1]; @@ -350,73 +342,29 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) GGML_ASSERT(K0 == K1); // K must match // src0: [K2, K1, M, B] -> QNN: [B, M, K1, K2] - uint32_t src0_dims[] = {static_cast(src0->ne[3]), 
static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; + uint32_t src0_dims[] = {B, M, static_cast(src0->ne[1]), static_cast(src0->ne[0])}; p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src0_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - // Gather on src0: [B, M, K1, K2] -> [M, B, K1, K2] - uint32_t gather0_indices_data[] = {1, 0, 2, 3}; // [B, M, K1, K2] -> [M, B, K1, K2] - uint32_t gather0_indices_dims[] = {4}; - p_gather0_indices = GQCGT(nullptr, "gather0_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - gather0_indices_dims, gather0_indices_data, sizeof(gather0_indices_data)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_indices)); - - uint32_t gather0_out_dims[] = {M, B, static_cast(src0->ne[1]), static_cast(src0->ne[0])}; - p_gather0_out = GQCGT(nullptr, "gather0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, - gather0_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather0_out)); - - Qnn_Param_t gather0_params[] = { - {QNN_PARAMTYPE_SCALAR, "axis", .scalarParam = {QNN_DATATYPE_INT_32, .int32Value = 0}} - }; - Qnn_Tensor_t gather0_inputs[] = {*p_tensor0, *p_gather0_indices}; - Qnn_Tensor_t gather0_outputs[] = {*p_gather0_out}; - Qnn_OpConfig_t gather0_op = ggmlqnn_create_op_config("gather0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_GATHER, gather0_params, 1, - gather0_inputs, 2, gather0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather0_op)); - // src1: [N, K, N1, B] -> QNN: [B, N1, K, N] - uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; + uint32_t src1_dims[] = {B, N1, static_cast(src1->ne[1]), static_cast(src1->ne[0])}; p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src1_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - // Gather on src1: [B, N1, K, N] -> [N1, B, K, N] - uint32_t gather1_indices_data[] = {1, 0, 2, 3}; // [B, N1, K, N] -> [N1, B, K, N] - uint32_t gather1_indices_dims[] = {4}; - p_gather1_indices = GQCGT(nullptr, "gather1_indices", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - gather1_indices_dims, gather1_indices_data, sizeof(gather1_indices_data)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_indices)); - - uint32_t gather1_out_dims[] = {N1, B, static_cast(src1->ne[1]), static_cast(src1->ne[0])}; - p_gather1_out = GQCGT(nullptr, "gather1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, - gather1_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_gather1_out)); - - Qnn_Param_t gather1_params[] = { - {QNN_PARAMTYPE_SCALAR, "axis", .scalarParam = {QNN_DATATYPE_INT_32, .int32Value = 0}} - }; - Qnn_Tensor_t gather1_inputs[] = {*p_tensor1, *p_gather1_indices}; - Qnn_Tensor_t gather1_outputs[] = {*p_gather1_out}; - Qnn_OpConfig_t gather1_op = ggmlqnn_create_op_config("gather1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_GATHER, gather1_params, 1, - gather1_inputs, 2, gather1_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, gather1_op)); - - // MatMul: [M, B, K0] x [N1, B, K1] -> [M, N1, N] - uint32_t matmul_in0_dims[] = {M, B, K0}; - Qnn_Tensor_t matmul_in0 = *p_gather0_out; + // 
MatMul: [B, M, K0] x [B, N1, K1] -> [B, M, N1] + uint32_t matmul_in0_dims[] = {B, M, K0}; + Qnn_Tensor_t matmul_in0 = *p_tensor0; QNN_VER_PTR(matmul_in0)->dimensions = matmul_in0_dims; QNN_VER_PTR(matmul_in0)->rank = 3; - uint32_t matmul_in1_dims[] = {N1, B, K1}; - Qnn_Tensor_t matmul_in1 = *p_gather1_out; + uint32_t matmul_in1_dims[] = {B, N1, K1}; + Qnn_Tensor_t matmul_in1 = *p_tensor1; QNN_VER_PTR(matmul_in1)->dimensions = matmul_in1_dims; QNN_VER_PTR(matmul_in1)->rank = 3; - uint32_t matmul_out_dims[] = {M, N1, N}; + uint32_t matmul_out_dims[] = {B, M, N1}; p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, matmul_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); @@ -428,7 +376,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - // Output: Match dst->ne directly + // Output: [M, N1, K2', K1'] matches dst->ne uint32_t dst_dims[] = {static_cast(dst->ne[0]), static_cast(dst->ne[1]), static_cast(dst->ne[2]), static_cast(dst->ne[3])}; p_tensor2 = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, dst_dims, nullptr, 0); @@ -438,19 +386,13 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); // Cache - qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_gather0_out, p_gather0_indices, p_tensor1, - p_gather1_out, p_gather1_indices, p_matmul_out, - p_tensor2}; + qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_tensor1, p_matmul_out, p_tensor2}; instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); } // Save dimensions uint32_t *tensor_0_dims = QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t *gather0_out_dims = QNN_VER_PTR(*p_gather0_out)->dimensions; - uint32_t *gather0_indices_dims = QNN_VER_PTR(*p_gather0_indices)->dimensions; uint32_t *tensor_1_dims = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t *gather1_out_dims = QNN_VER_PTR(*p_gather1_out)->dimensions; - uint32_t *gather1_indices_dims = QNN_VER_PTR(*p_gather1_indices)->dimensions; uint32_t *matmul_out_dims = QNN_VER_PTR(*p_matmul_out)->dimensions; uint32_t *tensor_2_dims = QNN_VER_PTR(*p_tensor2)->dimensions; @@ -466,14 +408,16 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) // Restore dimensions QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dims; - QNN_VER_PTR(*p_gather0_out)->dimensions = gather0_out_dims; - QNN_VER_PTR(*p_gather0_indices)->dimensions = gather0_indices_dims; QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dims; - QNN_VER_PTR(*p_gather1_out)->dimensions = gather1_out_dims; - QNN_VER_PTR(*p_gather1_indices)->dimensions = gather1_indices_dims; QNN_VER_PTR(*p_matmul_out)->dimensions = matmul_out_dims; QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dims; + // Log dst data for debugging + float *dst_data = (float *)dst->data; + for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) { + GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]); + } + op_perf.info(); } From 5cd37f01ce3d7dc1944f1fbcc89e7695cf80f2f2 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 14:54:38 +0800 Subject: [PATCH 103/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- good in step9 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 137 
+++++++++++++++++------------ 1 file changed, 82 insertions(+), 55 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 5b2552e45868b..f54f19ec7263e 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -307,14 +307,16 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) std::string graph_name; ggmlqnn_get_graphkey_from_op(op, graph_name); GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t *p_tensor0 = nullptr; + Qnn_Tensor_t *p_reshape0_out = nullptr; Qnn_Tensor_t *p_tensor1 = nullptr; + Qnn_Tensor_t *p_permute1_out = nullptr; + Qnn_Tensor_t *p_reshape1_out = nullptr; Qnn_Tensor_t *p_matmul_out = nullptr; - Qnn_Tensor_t *p_tensor2 = nullptr; - - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); // Keep debug line + Qnn_Tensor_t *p_reshape2_out = nullptr; if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { graph_initialized = true; @@ -322,97 +324,122 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) graph_handle = std::get<0>(graph_item); qnn_tensors_t &tensors = std::get<1>(graph_item); p_tensor0 = tensors[0]; - p_tensor1 = tensors[1]; - p_matmul_out = tensors[2]; - p_tensor2 = tensors[3]; + p_reshape0_out = tensors[1]; + p_tensor1 = tensors[2]; + p_permute1_out = tensors[3]; + p_reshape1_out = tensors[4]; + p_matmul_out = tensors[5]; + p_reshape2_out = tensors[6]; } else { CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); - // Define dimensions - uint32_t B = src0->ne[3]; - uint32_t M = src0->ne[2]; - uint32_t K0 = src0->ne[0] * src0->ne[1]; - uint32_t N1 = src1->ne[2]; - uint32_t K1 = src1->ne[1] * src1->ne[0]; - uint32_t N = src1->ne[0]; + // Define dimensions (GGML order: [K, M, H, B]) + uint32_t B = src0->ne[2] * src0->ne[3]; // 3 * 2 = 6 + uint32_t M = src0->ne[1]; // 16 + uint32_t K = src0->ne[0]; // 256 + uint32_t N = src1->ne[1]; // 16 - GGML_ASSERT(src0->ne[3] == src1->ne[3]); // Matching batch - GGML_ASSERT(dst->ne[2] == M); // M matches dst - GGML_ASSERT(K0 == K1); // K must match + GGML_ASSERT(src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3]); // Matching batch dimensions + GGML_ASSERT(dst->ne[0] == N && dst->ne[1] == M && dst->ne[2] == src0->ne[2] && dst->ne[3] == src0->ne[3]); - // src0: [K2, K1, M, B] -> QNN: [B, M, K1, K2] - uint32_t src0_dims[] = {B, M, static_cast(src0->ne[1]), static_cast(src0->ne[0])}; + // src0: [256, 16, 3, 2] -> QNN: [B, H, M, K] = [2, 3, 16, 256] + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src0_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - // src1: [N, K, N1, B] -> QNN: [B, N1, K, N] - uint32_t src1_dims[] = {B, N1, static_cast(src1->ne[1]), static_cast(src1->ne[0])}; + // Reshape src0 to [6, 16, 256] for [B, M, K] + uint32_t reshape0_out_dims[] = {B, M, K}; + p_reshape0_out = GQCGT(nullptr, "reshape0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape0_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape0_out)); + Qnn_Tensor_t 
reshape0_inputs[] = {*p_tensor0}; + Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; + Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape0_inputs, 1, reshape0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); + + // src1: [256, 16, 3, 2] -> QNN: [B, H, N, K] = [2, 3, 16, 256] + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src1_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - // MatMul: [B, M, K0] x [B, N1, K1] -> [B, M, N1] - uint32_t matmul_in0_dims[] = {B, M, K0}; - Qnn_Tensor_t matmul_in0 = *p_tensor0; - QNN_VER_PTR(matmul_in0)->dimensions = matmul_in0_dims; - QNN_VER_PTR(matmul_in0)->rank = 3; - - uint32_t matmul_in1_dims[] = {B, N1, K1}; - Qnn_Tensor_t matmul_in1 = *p_tensor1; - QNN_VER_PTR(matmul_in1)->dimensions = matmul_in1_dims; - QNN_VER_PTR(matmul_in1)->rank = 3; - - uint32_t matmul_out_dims[] = {B, M, N1}; + // Permute src1 to [2, 3, 256, 16] to align K and N + uint32_t perm_data[] = {0, 1, 3, 2}; // [B, H, N, K] -> [B, H, K, N] + uint32_t perm_dims[] = {4}; + Qnn_Tensor_t * p_perm = GQCGT(nullptr, "perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_perm)); + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; // [2, 3, 256, 16] + p_permute1_out = GQCGT(nullptr, "permute1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + permute1_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_permute1_out)); + Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; + Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; + Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; + Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, permute1_params, 1, + permute1_inputs, 1, permute1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); + + // Reshape src1 to [6, 256, 16] for [B, K, N] + uint32_t reshape1_out_dims[] = {B, K, N}; + p_reshape1_out = GQCGT(nullptr, "reshape1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape1_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape1_out)); + Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; + Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; + Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape1_inputs, 1, reshape1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); + + // MatMul: [6, 16, 256] x [6, 256, 16] -> [6, 16, 16] + uint32_t matmul_out_dims[] = {B, M, N}; p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, matmul_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); - - Qnn_Tensor_t matmul_inputs[] = {matmul_in0, matmul_in1}; + Qnn_Tensor_t matmul_inputs[] = {*p_reshape0_out, 
*p_reshape1_out}; Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, nullptr, 0, matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - // Output: [M, N1, K2', K1'] matches dst->ne - uint32_t dst_dims[] = {static_cast(dst->ne[0]), static_cast(dst->ne[1]), static_cast(dst->ne[2]), static_cast(dst->ne[3])}; - p_tensor2 = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, - dst_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + // Output: [16, 16, 3, 2] -> QNN: [2, 3, 16, 16] + uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; + p_reshape2_out = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, + reshape2_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape2_out)); + Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; + Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; + Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape2_inputs, 1, reshape2_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); // Finalize CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); // Cache - qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_tensor1, p_matmul_out, p_tensor2}; + qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tensor1, p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out}; instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); } - // Save dimensions - uint32_t *tensor_0_dims = QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t *tensor_1_dims = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t *matmul_out_dims = QNN_VER_PTR(*p_matmul_out)->dimensions; - uint32_t *tensor_2_dims = QNN_VER_PTR(*p_tensor2)->dimensions; - // Execute QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; + QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors[] = {*p_tensor2}; + Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, 1, NULL, NULL)); - // Restore dimensions - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dims; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dims; - QNN_VER_PTR(*p_matmul_out)->dimensions = matmul_out_dims; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dims; - - // Log dst data for debugging + // Log dst for debugging float *dst_data = (float *)dst->data; for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) { GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]); From cfd0ced2982bfd6327ab55c41522f2b1274be4ac Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 15:05:45 +0800 Subject: [PATCH 104/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- narrow down to make AI happy --- tests/ggml-qnn-ut.cpp | 8 ++++---- 1 file 
changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ggml-qnn-ut.cpp b/tests/ggml-qnn-ut.cpp index 1ab75526794e8..26e7a8847ae09 100644 --- a/tests/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn-ut.cpp @@ -439,10 +439,10 @@ int main(int argc, char * argv[]) { //src0 = ggml_new_tensor_3d(ctx, qtype, 128, 64, 8); //src1 = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 2, 8); //verify 4D matrix - //src0 = ggml_new_tensor_4d(ctx, qtype, 256, 16, 3, 2); - //src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 1, 6, 4); - src0 = ggml_new_tensor_4d(ctx, qtype, 256, 16, 3, 2); - src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); + src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); + src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 1, 6, 4); + //src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); + //src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); } ggml_set_input(src0); From e8981668f54edcd2b84ddab005719b52da847299 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 15:10:45 +0800 Subject: [PATCH 105/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step10 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 44 ++++++++++++++++++------------ 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index f54f19ec7263e..2f2491f619a5f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -307,6 +307,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) std::string graph_name; ggmlqnn_get_graphkey_from_op(op, graph_name); GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); Qnn_GraphHandle_t graph_handle = nullptr; @@ -334,23 +335,26 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); - // Define dimensions (GGML order: [K, M, H, B]) - uint32_t B = src0->ne[2] * src0->ne[3]; // 3 * 2 = 6 - uint32_t M = src0->ne[1]; // 16 - uint32_t K = src0->ne[0]; // 256 - uint32_t N = src1->ne[1]; // 16 + // Define dimensions + uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch: 3 * 2 = 6 + uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch: 6 * 4 = 24 + uint32_t M = src0->ne[1]; // 16 + uint32_t K = src0->ne[0]; // 256 + uint32_t N = src1->ne[1]; // 1 (second case), 16 (first case) - GGML_ASSERT(src0->ne[2] == src1->ne[2] && src0->ne[3] == src1->ne[3]); // Matching batch dimensions - GGML_ASSERT(dst->ne[0] == N && dst->ne[1] == M && dst->ne[2] == src0->ne[2] && dst->ne[3] == src0->ne[3]); + // Validate K matches + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match: 256 == 256 + // Output shape should match src1's batch dims + GGML_ASSERT(dst->ne[0] == N && dst->ne[1] == M && dst->ne[2] == src1->ne[2] && dst->ne[3] == src1->ne[3]); - // src0: [256, 16, 3, 2] -> QNN: [B, H, M, K] = [2, 3, 16, 256] + // src0: [256, 16, 3, 2] -> QNN: [2, 3, 16, 256] (B, H, M, K) uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src0_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - // Reshape src0 to [6, 16, 256] for [B, M, K] - uint32_t reshape0_out_dims[] = {B, M, K}; + // Reshape 
src0 to [6, 16, 256] for [B0, M, K] + uint32_t reshape0_out_dims[] = {B0, M, K}; p_reshape0_out = GQCGT(nullptr, "reshape0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, reshape0_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape0_out)); @@ -361,19 +365,19 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) reshape0_inputs, 1, reshape0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); - // src1: [256, 16, 3, 2] -> QNN: [B, H, N, K] = [2, 3, 16, 256] + // src1: [256, 1, 6, 4] -> QNN: [4, 6, 1, 256] (B, H, N, K) uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src1_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - // Permute src1 to [2, 3, 256, 16] to align K and N + // Permute src1 to [4, 6, 256, 1] to align K and N uint32_t perm_data[] = {0, 1, 3, 2}; // [B, H, N, K] -> [B, H, K, N] uint32_t perm_dims[] = {4}; Qnn_Tensor_t * p_perm = GQCGT(nullptr, "perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, perm_dims, perm_data, sizeof(perm_data)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_perm)); - uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; // [2, 3, 256, 16] + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; // [4, 6, 256, 1] p_permute1_out = GQCGT(nullptr, "permute1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, permute1_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_permute1_out)); @@ -385,8 +389,8 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) permute1_inputs, 1, permute1_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); - // Reshape src1 to [6, 256, 16] for [B, K, N] - uint32_t reshape1_out_dims[] = {B, K, N}; + // Reshape src1 to [24, 256, 1] for [B1, K, N] + uint32_t reshape1_out_dims[] = {B1, K, N}; p_reshape1_out = GQCGT(nullptr, "reshape1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, reshape1_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape1_out)); @@ -397,11 +401,15 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) reshape1_inputs, 1, reshape1_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); - // MatMul: [6, 16, 256] x [6, 256, 16] -> [6, 16, 16] - uint32_t matmul_out_dims[] = {B, M, N}; + // MatMul: [6, 16, 256] x [24, 256, 1] -> Needs adjustment for broadcasting + // Adjust src0 to match B1 by repeating or reshaping + uint32_t matmul_out_dims[] = {B1, M, N}; // [24, 16, 1] p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, matmul_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); + + // Note: QNN MatMul doesn't broadcast; we need to tile src0 + // For simplicity, assume dst shape drives execution; adjust src0 later if needed Qnn_Tensor_t matmul_inputs[] = {*p_reshape0_out, *p_reshape1_out}; Qnn_Tensor_t 
matmul_outputs[] = {*p_matmul_out}; Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, @@ -409,7 +417,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - // Output: [16, 16, 3, 2] -> QNN: [2, 3, 16, 16] + // Output: [1, 16, 6, 4] -> QNN: [4, 6, 16, 1] uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; p_reshape2_out = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, reshape2_out_dims, nullptr, 0); From 7bdeae09c99240cac6ca4bbb6448661b736331cc Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 15:19:38 +0800 Subject: [PATCH 106/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- narrow down to make AI happy --- tests/ggml-qnn-ut.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/ggml-qnn-ut.cpp b/tests/ggml-qnn-ut.cpp index 26e7a8847ae09..a7d8cb5619732 100644 --- a/tests/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn-ut.cpp @@ -439,10 +439,13 @@ int main(int argc, char * argv[]) { //src0 = ggml_new_tensor_3d(ctx, qtype, 128, 64, 8); //src1 = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 2, 8); //verify 4D matrix +#if 1 //failure src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 1, 6, 4); - //src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); - //src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); +#else //ok + src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); + src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); +#endif } ggml_set_input(src0); From e243ca5984adac91680cfe1cdab750ce7bc92577 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 15:25:50 +0800 Subject: [PATCH 107/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- step11 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 81 ++++++++++++++++++------------ 1 file changed, 48 insertions(+), 33 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 2f2491f619a5f..43ec5ee16a8c0 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -289,6 +289,7 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { * than ggml_qnn_mul_mat, so it's a standalone function. 
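// Aside: a self-contained sketch of the dimension bookkeeping this step settles on,
// assuming (as the code above does) that ggml stores ne[] fastest-varying first while
// the QNN dims arrays are written slowest-varying first. The numbers mirror the
// shapes used by the unit test; this is illustrative only, not part of the patch.
#include <cstdint>
#include <cstdio>

int main() {
    // ggml order: ne[0] = K (inner dim), ne[1] = rows, ne[2]/ne[3] = batch dims
    const int64_t src0_ne[4] = {256, 16, 3, 2};   // K, M, H0, B0
    const int64_t src1_ne[4] = {256,  1, 6, 4};   // K, N, H1, B1

    const uint32_t K  = (uint32_t) src0_ne[0];                 // 256
    const uint32_t M  = (uint32_t) src0_ne[1];                 // 16
    const uint32_t N  = (uint32_t) src1_ne[1];                 // 1
    const uint32_t B0 = (uint32_t)(src0_ne[2] * src0_ne[3]);   // 3 * 2 = 6
    const uint32_t B1 = (uint32_t)(src1_ne[2] * src1_ne[3]);   // 6 * 4 = 24

    // QNN-style dims are the ggml dims reversed, e.g. src0 -> [2, 3, 16, 256]
    const uint32_t src0_qnn_dims[4] = {
        (uint32_t) src0_ne[3], (uint32_t) src0_ne[2],
        (uint32_t) src0_ne[1], (uint32_t) src0_ne[0]};

    printf("K=%u M=%u N=%u B0=%u B1=%u src0 qnn dims=[%u,%u,%u,%u]\n",
           K, M, N, B0, B1,
           src0_qnn_dims[0], src0_qnn_dims[1], src0_qnn_dims[2], src0_qnn_dims[3]);
    return 0;
}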
* it will be combined with ggml_qnn_mul_mat after bugfix */ + static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; @@ -313,6 +314,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t *p_tensor0 = nullptr; Qnn_Tensor_t *p_reshape0_out = nullptr; + Qnn_Tensor_t *p_tile0_out = nullptr; Qnn_Tensor_t *p_tensor1 = nullptr; Qnn_Tensor_t *p_permute1_out = nullptr; Qnn_Tensor_t *p_reshape1_out = nullptr; @@ -326,34 +328,34 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) qnn_tensors_t &tensors = std::get<1>(graph_item); p_tensor0 = tensors[0]; p_reshape0_out = tensors[1]; - p_tensor1 = tensors[2]; - p_permute1_out = tensors[3]; - p_reshape1_out = tensors[4]; - p_matmul_out = tensors[5]; - p_reshape2_out = tensors[6]; + p_tile0_out = tensors[2]; + p_tensor1 = tensors[3]; + p_permute1_out = tensors[4]; + p_reshape1_out = tensors[5]; + p_matmul_out = tensors[6]; + p_reshape2_out = tensors[7]; } else { CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); // Define dimensions - uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch: 3 * 2 = 6 - uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch: 6 * 4 = 24 - uint32_t M = src0->ne[1]; // 16 - uint32_t K = src0->ne[0]; // 256 - uint32_t N = src1->ne[1]; // 1 (second case), 16 (first case) - - // Validate K matches - GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match: 256 == 256 - // Output shape should match src1's batch dims + uint32_t K = src0->ne[0]; // Inner dimension + uint32_t M = src0->ne[1]; // Rows of src0 + uint32_t N = src1->ne[1]; // Columns of src1 + uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch + uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) + + // Validate + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match GGML_ASSERT(dst->ne[0] == N && dst->ne[1] == M && dst->ne[2] == src1->ne[2] && dst->ne[3] == src1->ne[3]); - // src0: [256, 16, 3, 2] -> QNN: [2, 3, 16, 256] (B, H, M, K) + // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src0_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - // Reshape src0 to [6, 16, 256] for [B0, M, K] + // Reshape src0 to [B0, M, K] uint32_t reshape0_out_dims[] = {B0, M, K}; p_reshape0_out = GQCGT(nullptr, "reshape0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, reshape0_out_dims, nullptr, 0); @@ -365,19 +367,37 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) reshape0_inputs, 1, reshape0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); - // src1: [256, 1, 6, 4] -> QNN: [4, 6, 1, 256] (B, H, N, K) + // Tile src0 to match B1: [B0, M, K] -> [B1, M, K] + uint32_t tile0_out_dims[] = {B1, M, K}; + p_tile0_out = GQCGT(nullptr, "tile0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + tile0_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile0_out)); + uint32_t tile_multiples[] = {B1 / B0, 1, 1}; // e.g., 24/6 = 4, 6/6 = 1 + uint32_t tile_dims[] = {3}; + 
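// Aside: a plain-CPU reference for what the Tile + MatMul pair is expected to compute,
// assuming the Tile node with multiples {B1/B0, 1, 1} repeats the whole [B0, M, K]
// block B1/B0 times along the batch axis and the MatMul node then performs an
// independent [M, K] x [K, N] product per batch. This is an illustrative sketch of the
// intended math only, not the QNN operators themselves.
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Repeat a contiguous [B0, M, K] buffer B1/B0 times along the batch axis -> [B1, M, K].
static std::vector<float> tile_batch(const std::vector<float> & src,
                                     size_t B0, size_t B1, size_t M, size_t K) {
    assert(B1 % B0 == 0 && src.size() == B0 * M * K);
    std::vector<float> dst(B1 * M * K);
    for (size_t r = 0; r < B1 / B0; r++) {
        std::copy(src.begin(), src.end(), dst.begin() + r * src.size());
    }
    return dst;
}

// Batched matmul reference: a is [B, M, K], b is [B, K, N], result is [B, M, N].
static std::vector<float> batched_matmul(const std::vector<float> & a,
                                         const std::vector<float> & b,
                                         size_t B, size_t M, size_t K, size_t N) {
    std::vector<float> c(B * M * N, 0.0f);
    for (size_t bi = 0; bi < B; bi++) {
        for (size_t m = 0; m < M; m++) {
            for (size_t n = 0; n < N; n++) {
                float acc = 0.0f;
                for (size_t k = 0; k < K; k++) {
                    acc += a[(bi * M + m) * K + k] * b[(bi * K + k) * N + n];
                }
                c[(bi * M + m) * N + n] = acc;
            }
        }
    }
    return c;
}
// For the unit-test sizes, tile_batch(src0, 6, 24, 16, 256) followed by
// batched_matmul(..., 24, 16, 256, 1) yields a [24, 16, 1] buffer, matching the
// matmul_out_dims chosen above.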
Qnn_Tensor_t *p_tile_multiples = GQCGT(nullptr, "tile_multiples", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + tile_dims, tile_multiples, sizeof(tile_multiples)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile_multiples)); + Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; + Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; + Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; + Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TILE, tile_params, 1, + tile0_inputs, 1, tile0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); + + // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, src1_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - // Permute src1 to [4, 6, 256, 1] to align K and N - uint32_t perm_data[] = {0, 1, 3, 2}; // [B, H, N, K] -> [B, H, K, N] + // Permute src1 to [B1, H1, K, N] + uint32_t perm_data[] = {0, 1, 3, 2}; uint32_t perm_dims[] = {4}; - Qnn_Tensor_t * p_perm = GQCGT(nullptr, "perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - perm_dims, perm_data, sizeof(perm_data)); + Qnn_Tensor_t *p_perm = GQCGT(nullptr, "perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_perm)); - uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; // [4, 6, 256, 1] + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; p_permute1_out = GQCGT(nullptr, "permute1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, permute1_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_permute1_out)); @@ -389,7 +409,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) permute1_inputs, 1, permute1_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); - // Reshape src1 to [24, 256, 1] for [B1, K, N] + // Reshape src1 to [B1, K, N] uint32_t reshape1_out_dims[] = {B1, K, N}; p_reshape1_out = GQCGT(nullptr, "reshape1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, reshape1_out_dims, nullptr, 0); @@ -401,23 +421,19 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) reshape1_inputs, 1, reshape1_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); - // MatMul: [6, 16, 256] x [24, 256, 1] -> Needs adjustment for broadcasting - // Adjust src0 to match B1 by repeating or reshaping - uint32_t matmul_out_dims[] = {B1, M, N}; // [24, 16, 1] + // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] + uint32_t matmul_out_dims[] = {B1, M, N}; p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, matmul_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); - - // Note: QNN MatMul doesn't broadcast; we need to tile src0 - // For simplicity, assume dst shape drives execution; adjust src0 later 
if needed - Qnn_Tensor_t matmul_inputs[] = {*p_reshape0_out, *p_reshape1_out}; + Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, nullptr, 0, matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - // Output: [1, 16, 6, 4] -> QNN: [4, 6, 16, 1] + // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; p_reshape2_out = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, reshape2_out_dims, nullptr, 0); @@ -433,7 +449,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); // Cache - qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tensor1, p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out}; + qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out}; instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); } @@ -455,7 +471,6 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) op_perf.info(); } - /* * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs * using the QNN backend. this function performs matrix multiplication of the input tensor From e1fef6bfee987aaf7974b08645307e67d714fea5 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 15:36:23 +0800 Subject: [PATCH 108/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 --- both ok in step12 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 6 +++--- tests/ggml-qnn-ut.cpp | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 43ec5ee16a8c0..23c777227550d 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -289,7 +289,6 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { * than ggml_qnn_mul_mat, so it's a standalone function. 
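// Aside: ggml_mul_mat(ctx, a, b) allocates its result with
//     ne = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] },
// so for a = [256, 16, 3, 2] and b = [256, 1, 6, 4] the dst tensor comes out as
// [16, 1, 6, 4] (M in ne[0], N in ne[1]); that is presumably why the stricter
// dst-shape assertion is relaxed in the hunk below. A minimal worked check of that
// convention, for illustration only:
#include <cstdint>
#include <cstdio>

int main() {
    const int64_t a_ne[4] = {256, 16, 3, 2};   // src0: K, M, H0, B0
    const int64_t b_ne[4] = {256,  1, 6, 4};   // src1: K, N, H1, B1
    const int64_t dst_ne[4] = { a_ne[1], b_ne[1], b_ne[2], b_ne[3] };
    printf("expected dst ne = [%lld, %lld, %lld, %lld]\n",      // [16, 1, 6, 4]
           (long long) dst_ne[0], (long long) dst_ne[1],
           (long long) dst_ne[2], (long long) dst_ne[3]);
    return 0;
}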
* it will be combined with ggml_qnn_mul_mat after bugfix */ - static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; bool graph_initialized = false; @@ -347,7 +346,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) // Validate GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match - GGML_ASSERT(dst->ne[0] == N && dst->ne[1] == M && dst->ne[2] == src1->ne[2] && dst->ne[3] == src1->ne[3]); + //GGML_ASSERT(dst->ne[0] == N && dst->ne[1] == M && dst->ne[2] == src1->ne[2] && dst->ne[3] == src1->ne[3]); // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; @@ -372,7 +371,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) p_tile0_out = GQCGT(nullptr, "tile0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, tile0_out_dims, nullptr, 0); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile0_out)); - uint32_t tile_multiples[] = {B1 / B0, 1, 1}; // e.g., 24/6 = 4, 6/6 = 1 + uint32_t tile_multiples[] = {B1 / B0, 1, 1}; uint32_t tile_dims[] = {3}; Qnn_Tensor_t *p_tile_multiples = GQCGT(nullptr, "tile_multiples", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, tile_dims, tile_multiples, sizeof(tile_multiples)); @@ -465,6 +464,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) // Log dst for debugging float *dst_data = (float *)dst->data; + GGMLQNN_LOG_DEBUG("dst shape: [%d, %d, %d, %d]\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) { GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]); } diff --git a/tests/ggml-qnn-ut.cpp b/tests/ggml-qnn-ut.cpp index a7d8cb5619732..5846d64b0e67a 100644 --- a/tests/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn-ut.cpp @@ -332,7 +332,8 @@ int main(int argc, char * argv[]) { std::vector backends; std::vector> set_n_threads_fns; printf("Testing %zu devices\n\n", ggml_backend_dev_count()); - for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + //for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + for (size_t i = 0; i < 2; i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), @@ -439,7 +440,7 @@ int main(int argc, char * argv[]) { //src0 = ggml_new_tensor_3d(ctx, qtype, 128, 64, 8); //src1 = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 2, 8); //verify 4D matrix -#if 1 //failure +#if 1 //ok src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 1, 6, 4); #else //ok From ffb119f1b9e35f1b47e924f07e1cf71d9ddb1f43 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 15:46:09 +0800 Subject: [PATCH 109/200] ggml-qnn: AI-assisted ggml_qnn_mul_mat_4d by Grok 3 ---finalizing version also both ok in step13 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 23c777227550d..0ee172779a7dc 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -344,9 +344,8 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) - // Validate + // 
Validate K only GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match - //GGML_ASSERT(dst->ne[0] == N && dst->ne[1] == M && dst->ne[2] == src1->ne[2] && dst->ne[3] == src1->ne[3]); // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; @@ -471,6 +470,7 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) op_perf.info(); } + /* * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs * using the QNN backend. this function performs matrix multiplication of the input tensor From 58d64e72b269cb725b80e546746180920cc36a47 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 16:34:51 +0800 Subject: [PATCH 110/200] ggml-qnn: refine ggml_qnn_mul_mat and ggml_qnn_general_node according to Grok 3's style --- ggml/src/ggml-qnn/ggml-qnn-impl.h | 2 +- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 136 +++++++++-------------------- tests/ggml-qnn-ut.cpp | 51 ++++++----- 3 files changed, 69 insertions(+), 120 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h index 394c35fe6b043..6b527724ee292 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -99,7 +99,7 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char #else #define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend #define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 1 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log #define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU #define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 #endif diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 0ee172779a7dc..9c4bcaf13877f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -200,71 +200,25 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); instance->_qnn_graph_map[graph_name] = graph_item; - } else { - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - src0_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = ggmlqnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*p_tensor0)->rank = ggml_n_dims(src0); - QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; - - QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*p_tensor1)->rank = ggml_n_dims(src1); - QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; - - QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; - QNN_VER_PTR(*p_tensor2)->rank = ggml_n_dims(dst); - QNN_VER_PTR(*p_tensor2)->dataType = 
dst_qnn_type; - - if (enable_npu_rpc) { - //TODO: NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); - if (nullptr != qnn_buffer_0) { - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - } - - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); - if (nullptr != qnn_buffer_1) { - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - } - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); + } - if (enable_npu_rpc) { - //TODO:NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - if (nullptr != qnn_buffer_2) { - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + + if (enable_npu_rpc) { + //TODO:NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); } } @@ -461,12 +415,14 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, 1, NULL, NULL)); +#if 0 // Log dst for debugging float *dst_data = (float *)dst->data; GGMLQNN_LOG_DEBUG("dst shape: [%d, %d, %d, %d]\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) { GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]); } +#endif op_perf.info(); } @@ -665,14 +621,8 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { #endif CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); - //step-6: finalize qnn graph and execute qnn graph + //step-6: finalize qnn graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - input_tensors_0, 2, - output_tensors_0, 1, - nullptr, nullptr)); qnn_tensors_t ggml_op_mulmat_tensors; ggml_op_mulmat_tensors.reserve(5); @@ -683,30 +633,30 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); instance->_qnn_graph_map[graph_name] = graph_item; - } else { - if (src0_type != GGML_TYPE_F32) { - 
QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + } - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - // this is the second technical approach or another pipeline of "how to utilize the Hexagon - // NPU maximally" through QNN SDK, details could be found at - // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + // this is the second technical approach or another pipeline of "how to utilize the Hexagon + // NPU maximally" through QNN SDK, details could be found at + // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; diff --git a/tests/ggml-qnn-ut.cpp b/tests/ggml-qnn-ut.cpp index 5846d64b0e67a..08d02e502b6ae 100644 --- a/tests/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn-ut.cpp @@ -332,37 +332,36 @@ int main(int argc, char * argv[]) { std::vector backends; std::vector> set_n_threads_fns; printf("Testing %zu devices\n\n", ggml_backend_dev_count()); - //for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - for (size_t i = 0; i < 2; i++) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); + for (size_t i = 0; i < ggml_backend_dev_count(); i++) { + ggml_backend_dev_t dev = ggml_backend_dev_get(i); - printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), - ggml_backend_dev_name(dev)); + printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), + ggml_backend_dev_name(dev)); - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { - printf(" Skipping CPU backend\n"); - continue; - } + if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { + printf(" Skipping CPU backend\n"); + continue; + } - backend = ggml_backend_dev_init(dev, reinterpret_cast(i)); - GGML_ASSERT(backend != NULL); - if (backend != nullptr) { - printf("%s: initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - } - backends.emplace_back(backend); + backend = ggml_backend_dev_init(dev, reinterpret_cast(i)); + GGML_ASSERT(backend != NULL); + if (backend != nullptr) { + printf("%s: initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); + } + backends.emplace_back(backend); - ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address( - reg, 
"ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency()); - } + ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); + auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address( + reg, "ggml_backend_set_n_threads"); + if (ggml_backend_set_n_threads_fn) { + ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency()); + } - printf(" Device description: %s\n", ggml_backend_dev_description(dev)); - size_t free, total; - ggml_backend_dev_memory(dev, &free, &total); - printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); - printf("\n"); + printf(" Device description: %s\n", ggml_backend_dev_description(dev)); + size_t free, total; + ggml_backend_dev_memory(dev, &free, &total); + printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); + printf("\n"); } ggml_backend_t backend_cpu = nullptr; From d5ec230bf6bdb80db912b68e9b324fbb8210ac64 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 2 Mar 2025 16:55:06 +0800 Subject: [PATCH 111/200] ggml-qnn: remove no-needed comments --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 6 +++--- ggml/src/ggml-qnn/ggml-qnn.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 9c4bcaf13877f..851eaf1b9a124 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -232,16 +232,16 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { #endif } -//TODO:there is issue in this function /* - * this function is AI-assisted code from Grok 3 for purpose of 4d mulmat UT in ggml-qnn-ut.cpp + * this function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend + * UT in ggml-qnn-ut.cpp passed: * ./scripts/build-run-android.sh run_ut_mulmat 0 * ./scripts/build-run-android.sh run_ut_mulmat 1 * ./scripts/build-run-android.sh run_ut_mulmat 2 * * the logic of ggml_qnn_mul_mat_4d is similar to ggml_qnn_mul_mat but much more complicated * than ggml_qnn_mul_mat, so it's a standalone function. 
- * it will be combined with ggml_qnn_mul_mat after bugfix + * it will be combined with ggml_qnn_mul_mat in the future */ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 3b59956009398..a5533c5d4cab5 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -2277,7 +2277,7 @@ static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const s return false; if (src0_rank < 2) // QNN's limitation, make QNN SDK happy return false; - if (4 == src0_rank) //TODO: 4D matrix mulmat + if (4 == src0_rank) //TODO: 4D matrix mulmat in CT return false; if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy return false; From 058ba0f9b66204c976536f00e37dbc664262d6ad Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 3 Mar 2025 13:13:41 +0800 Subject: [PATCH 112/200] ggml-qnn: Windows port --- step3 --- CMakeLists.txt | 1 + examples/export-lora/export-lora.cpp | 2 +- ggml/src/ggml-qnn/CMakeLists.txt | 35 +++++ ggml/src/ggml-qnn/ggml-qnn-impl.h | 92 ++++++------- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 191 ++++++++++++++++++++------- ggml/src/ggml-qnn/ggml-qnn.cpp | 50 ++++--- scripts/build-run-android.sh | 20 +-- tests/ggml-qnn-ut.cpp | 133 +++++-------------- 8 files changed, 302 insertions(+), 222 deletions(-) create mode 100644 ggml/src/ggml-qnn/CMakeLists.txt diff --git a/CMakeLists.txt b/CMakeLists.txt index de51c0a17b2f6..f124bc2957472 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,7 @@ include(CheckIncludeFileCXX) set(CMAKE_WARN_UNUSED_CLI YES) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_VERBOSE_MAKEFILE on) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 24dc85cf27336..f038019b007b4 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -148,7 +148,7 @@ struct lora_merge_ctx { ctx_out = gguf_init_empty(); struct ggml_init_params params = { - /*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(), + /*.mem_size =*/ static_cast(gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead()), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt new file mode 100644 index 0000000000000..1156c98fbc9d7 --- /dev/null +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -0,0 +1,35 @@ +message(STATUS "Using QNN backend") + +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + find_library(LOG_LIB log) + set(QNN_LINK_LIBRARIES ${LOG_LIB}) + set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") +elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") + set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") +else() + message(FATAL_ERROR "QNN now only available on Android and Windows(Windows on ARM)") +endif() + +if(NOT DEFINED GGML_QNN_SDK_PATH) +# try read from environment variable + if(DEFINED ENV{QNN_SDK_PATH}) + set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) + else() + message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") + endif() +endif() + +message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") + +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + +file(GLOB QNN_SOURCES 
"${CMAKE_CURRENT_LIST_DIR}/*.cpp") + ggml_add_backend_library(ggml-qnn + ${QNN_SOURCES} +) + +target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) +target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) + +string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h index 6b527724ee292..5a2fe5752a097 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -255,7 +255,9 @@ class qnn_perf { #else class qnn_perf { public: - qnn_perf(const std::string & perf_name) {} + qnn_perf(const std::string & perf_name) { + GGML_UNUSED(perf_name); + } qnn_perf() = delete; qnn_perf(const qnn_perf & ) = delete; qnn_perf & operator= (const qnn_perf & ) = delete; @@ -287,86 +289,86 @@ class qnn_interface { qnn_interface() = default; // QnnBackend - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree) - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage) - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig) - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion); + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion) // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate) - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree); + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree) - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure) - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo) - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo); + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo) // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate) - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize) - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary) - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary); + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary) - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree); + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree) // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate) - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode) - 
DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize) - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve); + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve) // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate) - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree); + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree) - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel); + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel) // QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate) - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents) - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents) - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData) - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree); + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree) // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister); + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister) - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister); + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister) // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability); + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability) // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor) - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor); + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor) // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate); + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate) - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo); + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo) - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree); + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree) void set_qnn_interface(const QnnInterface_t * qnn_interface) { _qnn_interface = qnn_interface; @@ -398,7 +400,7 @@ class qnn_instance { const std::string & model_name) : _lib_path(std::move(lib_path)), _backend_name(std::move(backend_name)), - _model_name(std::move(model_name)) {}; + _model_name(std::move(model_name)) {} ~qnn_instance() { } @@ -428,19 +430,19 @@ class qnn_instance { return _qnn_raw_system_interface; } - const Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - const Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - const Qnn_DeviceHandle_t 
get_qnn_device_handle() { return _qnn_device_handle; } + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - const Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - const Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - const QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - const Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } int init_qnn_graph(const char * graph_name, bool debug, diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 851eaf1b9a124..00cb7da32c183 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -200,25 +200,71 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); instance->_qnn_graph_map[graph_name] = graph_item; - } + } else { + Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; + Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; + + src0_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src0->type); + src1_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src1->type); + dst_qnn_type = ggmlqnn_datatype_from_ggml_datatype(dst->type); + + uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], + (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; + uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], + (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; + uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], + (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; + + QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; + QNN_VER_PTR(*p_tensor0)->rank = ggml_n_dims(src0); + QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; + + QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; + QNN_VER_PTR(*p_tensor1)->rank = ggml_n_dims(src1); + QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; + + QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; + QNN_VER_PTR(*p_tensor2)->rank = ggml_n_dims(dst); + QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type; - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - - if (enable_npu_rpc) { - //TODO:NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - if (nullptr != qnn_buffer_2) { - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + if (enable_npu_rpc) { + //TODO: NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } + + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); + 
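// Aside: on the NPU RPC path the graph tensors are backed by shared rpcmem buffers
// rather than the ggml host buffers, so inputs are staged in with memcpy before
// graphExecute and the result is copied back out afterwards. A minimal sketch of that
// staging pattern; the helper names and buffer arguments are illustrative, not part
// of the patch.
#include <cstddef>
#include <cstring>

static void stage_in(void * rpc_buf, const void * host_src, size_t nbytes) {
    if (rpc_buf != nullptr) {
        memcpy(rpc_buf, host_src, nbytes);   // host tensor data -> shared rpc buffer
    }
}

static void stage_out(void * host_dst, const void * rpc_buf, size_t nbytes) {
    if (rpc_buf != nullptr) {
        memcpy(host_dst, rpc_buf, nbytes);   // shared rpc buffer -> host tensor data
    }
}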
GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + } + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + + if (enable_npu_rpc) { + //TODO:NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } } @@ -472,7 +518,6 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { const uint32_t src1_rank = ggml_n_dims(src1); GGML_ASSERT(src0_rank == src1_rank); GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy - //GGML_ASSERT(src0_rank != 4); //TODO: 4D matrix mulmat if (4 == src0_rank) { return ggml_qnn_mul_mat_4d(ctx, op); } @@ -591,13 +636,13 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { }; #else Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); + out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); #endif CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); //step-5: compose qnn graph: add transpose node Qnn_Param_t out_trans1_0_params[] = { - {(Qnn_ParamType_t) 1, + {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor } }; @@ -617,12 +662,18 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { }; #else Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, - out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); + out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); #endif CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); - //step-6: finalize qnn graph + //step-6: finalize qnn graph and execute qnn graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + input_tensors_0, 2, + output_tensors_0, 1, + nullptr, nullptr)); qnn_tensors_t ggml_op_mulmat_tensors; ggml_op_mulmat_tensors.reserve(5); @@ -633,30 +684,30 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); instance->_qnn_graph_map[graph_name] = graph_item; - } - - if (src0_type != GGML_TYPE_F32) { - QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + 
QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + // this is the second technical approach or another pipeline of "how to utilize the Hexagon + // NPU maximally" through QNN SDK, details could be found at + // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - // this is the second technical approach or another pipeline of "how to utilize the Hexagon - // NPU maximally" through QNN SDK, details could be found at - // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; @@ -666,67 +717,109 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { } void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } - void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void 
ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); + GGML_UNUSED(value); } void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { @@ -734,10 +827,16 @@ void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { } void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index a5533c5d4cab5..2aacf8f52d578 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -52,6 +52,7 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * static std::mutex ggmlqnn_log_internal_mutex; static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; + GGML_UNUSED(file); { std::lock_guard lock(ggmlqnn_log_internal_mutex); va_list args; @@ -82,6 +83,7 @@ static const char * last_func = nullptr; static long last_err; void * dlopen(const char * dll, int flags) { HINSTANCE h = LoadLibraryA(dll); + GGML_UNUSED(flags); if (h == NULL) { last_err = GetLastError(); last_func = "dlopen"; @@ -174,7 +176,7 @@ static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, siz } static char * ggmlqnn_strndup(const char * source, size_t maxlen) { - return ::strndup(source, maxlen); + return strndup(source, maxlen); } static void * ggmlqnn_host_malloc(size_t n) { @@ -553,8 +555,9 @@ Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, num_inputs, inputs, num_outputs, outputs }; + Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; - return (Qnn_OpConfig_t){QNN_OPCONFIG_VERSION_1, .v1 = v1}; + return opcfg; } // ================================================================================================= @@ -1069,9 +1072,6 @@ void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op) { GGML_ASSERT(nb00 == ggml_type_size(src0_type)); GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; const int64_t ne_plane = ne01 * ne00; const size_t desired_size = ((GGML_TYPE_F32 == src0_type) ? 
0 : ne03 * ne02 * ne_plane * sizeof(float)); ctx->desired_size = desired_size; @@ -1157,7 +1157,7 @@ size_t ggmlqnn_get_opcaps_size() { size_t ggmlqnn_get_op_index(const ggml_tensor * tensor) { if (tensor->op == GGML_OP_UNARY) { - return GGML_OP_COUNT + ggml_get_unary_op(tensor); + return static_cast(GGML_OP_COUNT) + static_cast(ggml_get_unary_op(tensor)); } return tensor->op; @@ -1280,8 +1280,6 @@ void qnn_instance::free_rpcmem(void * buf) { } void qnn_instance::free_rpcmem() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (_rpcmem_store_map.empty()) { GGMLQNN_LOG_WARN("no rpcmem allocated\n"); return; @@ -1709,6 +1707,10 @@ static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { + GGML_UNUSED(fmt); + GGML_UNUSED(level); + GGML_UNUSED(timestamp); + GGML_UNUSED(argp); } #endif @@ -1851,7 +1853,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); GGMLQNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (int i = 0; i < p_info->v1.numHwDevices; i++) { + for (size_t i = 0; i < p_info->v1.numHwDevices; i++) { GGMLQNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, infos[i].v1.deviceType, infos[i].v1.numCores); QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; @@ -1863,7 +1865,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { chipinfo.socModel, qnn_get_socmodel_desc(chipinfo.socModel), \ htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); struct qcom_socinfo * socinfo = qnn_get_socinfo_from_socmodel(chipinfo.socModel); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; if (nullptr != socinfo) { memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); GGMLQNN_LOG_INFO("soc info:%s", socinfo->soc_desc); @@ -2259,6 +2261,11 @@ static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const s const uint32_t src0_rank = ggml_n_dims(src0); const uint32_t src1_rank = ggml_n_dims(src1); + GGML_UNUSED(ne01); + GGML_UNUSED(ne10); + GGML_UNUSED(ne11); + GGML_UNUSED(ne0); + GGML_UNUSED(ne1); if (tensor->op == GGML_OP_ADD) { //dump_op_info(tensor); @@ -2470,14 +2477,12 @@ static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - return ctx->buffer; } static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - GGML_UNUSED(error); + GGML_UNUSED(tensor); GGML_UNUSED(ctx); return; } @@ -2534,6 +2539,7 @@ static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { }; static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); return "qnn-buffer"; } @@ -2541,7 +2547,13 @@ static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; +#if defined(__ANDROID__) || defined(__linux__) 
size_t size_page = sysconf(_SC_PAGESIZE); +#elif defined(_WIN32) + SYSTEM_INFO systeminfo; + GetSystemInfo(&systeminfo); + size_t size_page = systeminfo.dwPageSize; +#endif size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); @@ -2561,11 +2573,11 @@ static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_typ return 32; } -//FIXME: this value is an experimental value on Snapdragon 8 Gen3 based phone +//TODO:not used currently static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return (2 * (1 << 30)); + return (2 * (1 << 20)); } static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { @@ -2645,6 +2657,7 @@ static const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { struct ggml_backend_qnn_context * ctx = static_cast(dev->context); + static char qnn_device_desc[256]; if (nullptr == ctx) { GGMLQNN_LOG_ERROR("pls check why ctx is null"); return "unknown"; @@ -2655,7 +2668,9 @@ static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t d std::string dev_desc = std::string(ctx->desc) + std::string(soc_info) + "_" + std::string(htp_arch) + "," + std::string(ctx->socinfo.soc_desc); - return dev_desc.c_str(); + memset(qnn_device_desc, 0, 256); + memcpy(qnn_device_desc, dev_desc.c_str(), strlen(dev_desc.c_str())); + return qnn_device_desc; } else { return ctx->desc; } @@ -2717,7 +2732,7 @@ static ggml_backend_t ggml_backend_qnn_device_init_backend(ggml_backend_dev_t de } -ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { +static ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { if (device_index >= GGML_QNN_MAX_DEVICES) { GGMLQNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", device_index, GGML_QNN_MAX_DEVICES - 1); @@ -2733,6 +2748,7 @@ ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes /* .is_host = */ ggml_backend_qnn_buffer_is_host }, + /* .device = */ nullptr, /* .context = */ nullptr, }; diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 49079c9132769..3d239510b8d63 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -106,15 +106,15 @@ function check_qnn_libs() function update_qnn_libs() { - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ + + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ + adb push 
${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ } @@ -152,7 +152,7 @@ function run_llamacli() adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-ncv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" } diff --git a/tests/ggml-qnn-ut.cpp b/tests/ggml-qnn-ut.cpp index 08d02e502b6ae..75d941263b82c 100644 --- a/tests/ggml-qnn-ut.cpp +++ b/tests/ggml-qnn-ut.cpp @@ -28,8 +28,9 @@ #include #include #include -#include #include +#if defined(__ANDROID__) || defined(__linux__) +#include #include #include #include @@ -40,6 +41,7 @@ #include #include #include +#endif #include #include @@ -70,41 +72,10 @@ #include "ggml-backend.h" #include "ggml-qnn.h" -#define GGML_QNN_DEBUG 1 -#define GGML_QNN_LOGBUF_LEN 4096 - -#define QNN_LOG_ERROR(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_WARN(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define QNN_LOG_INFO(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if GGML_QNN_DEBUG -#define QNN_LOG_DEBUG(...) ggml_qnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define QNN_LOG_DEBUG(...) -#endif - static void tensor_dump(const ggml_tensor * tensor, const char * name); #define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) -static void ggml_qnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ - static std::mutex ggml_qnn_log_internal_mutex; - static char s_ggml_qnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; - - { - std::lock_guard lock(ggml_qnn_log_internal_mutex); - va_list args; - va_start(args, format); - int len_prefix = snprintf(s_ggml_qnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggml_qnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { - printf("%s", s_ggml_qnn_log_internal_buf); - } - va_end(args); - } -} - - static bool ggml_graph_compute_helper( struct ggml_backend * backend, struct ggml_cgraph * graph, @@ -142,8 +113,8 @@ static void tensor_dump_elements(const ggml_tensor * tensor) { tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - QNN_LOG_DEBUG("%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (4096 - 96)) { + printf("%s\n", tmposs.str().c_str()); } tmposs.clear(); tmposs.str(""); @@ -152,20 +123,20 @@ static void tensor_dump_elements(const ggml_tensor * tensor) { } } - QNN_LOG_DEBUG("\n"); + printf("\n"); } static void tensor_dump(const ggml_tensor * tensor, const char * name) { - QNN_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", + printf("dump ggml tensor %s(%s)\n", name, tensor->name); + printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); tensor_dump_elements(tensor); - QNN_LOG_DEBUG("\n"); + printf("\n"); } @@ -181,15 +152,6 @@ static uint32_t get_tensor_rank(const ggml_tensor * tensor) { static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = get_tensor_rank(tensor); - for (size_t i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - QNN_LOG_DEBUG("get_tensor_data_size %d", data_size); - QNN_LOG_DEBUG("ggml_nbytes(tensor) %d", ggml_nbytes(tensor)); - return ggml_nbytes(tensor); } @@ -273,9 +235,6 @@ static void show_usage() { } -struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } }; -typedef std::unique_ptr ggml_backend_ptr; - int main(int argc, char * argv[]) { int64_t n_begin_time = 0LL; int64_t n_end_time = 0LL; @@ -329,8 +288,7 @@ int main(int argc, char * argv[]) { return 1; } } - std::vector backends; - std::vector> set_n_threads_fns; + printf("Testing %zu devices\n\n", ggml_backend_dev_count()); for (size_t i = 0; i < ggml_backend_dev_count(); i++) { ggml_backend_dev_t dev = ggml_backend_dev_get(i); @@ -348,14 +306,6 @@ int main(int argc, char * argv[]) { if (backend != nullptr) { printf("%s: initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); } - backends.emplace_back(backend); - - ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address( - reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - ggml_backend_set_n_threads_fn(backend, std::thread::hardware_concurrency()); - } printf(" Device description: %s\n", 
ggml_backend_dev_description(dev)); size_t free, total; @@ -367,23 +317,19 @@ int main(int argc, char * argv[]) { ggml_backend_t backend_cpu = nullptr; backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); if (nullptr == backend_cpu) { - QNN_LOG_DEBUG("failed to initialize cpu backend\n"); + printf("failed to initialize cpu backend\n"); exit(1); } else { - QNN_LOG_DEBUG("succeed to initialize cpu backend\n"); + printf("succeed to initialize cpu backend\n"); } - backends.emplace_back(backend_cpu); - - size_t n_ok = 0; - QNN_LOG_DEBUG("enter qnn_ggml_op\n"); - QNN_LOG_DEBUG("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); + printf("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); n_begin_time = ggml_time_us(); srand(time(NULL)); ctx_size += 1024 * 1024 * 32; - QNN_LOG_DEBUG("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, + printf("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, (ctx_size / 1024 / 1024)); struct ggml_init_params params = { @@ -392,38 +338,19 @@ int main(int argc, char * argv[]) { /* no_alloc =*/ 0 }; - int idx = 0; - for (auto & backend_it : backends) { - if (idx == n_backend_type) { - backend = backend_it.get(); - } - idx++; - ggml_backend_dev_t dev = ggml_backend_get_device(backend_it.get()); - ggml_backend_reg_t reg = dev ? ggml_backend_dev_backend_reg(dev) : nullptr; - if (reg) { - auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads"); - if (ggml_backend_set_n_threads_fn) { - set_n_threads_fns.emplace_back(backend_it.get(), ggml_backend_set_n_threads_fn); - } - } - const char * name = ggml_backend_dev_description(dev); - QNN_LOG_DEBUG("dev name %s\n", name); - - } - if (n_backend_type != QNN_BACKEND_GGML) { params.no_alloc = true; } ctx = ggml_init(params); if (!ctx) { - QNN_LOG_ERROR("%s: ggml_init() failed\n"); + printf("ggml_init() failed\n"); return 2; } - QNN_LOG_DEBUG("creating new tensors\n"); - QNN_LOG_DEBUG("ggml_blck_size(%s) %d\n", ggml_type_name(qtype), ggml_blck_size(qtype)); - QNN_LOG_DEBUG("ggml_type_size(%s) %d\n", ggml_type_name(qtype), ggml_type_size(qtype)); + printf("creating new tensors\n"); + printf("ggml_blck_size(%s) %ld\n", ggml_type_name(qtype), ggml_blck_size(qtype)); + printf("ggml_type_size(%s) %ld\n", ggml_type_name(qtype), ggml_type_size(qtype)); if (qtype != GGML_TYPE_F32) { sizex = ggml_blck_size(qtype); } @@ -461,7 +388,7 @@ int main(int argc, char * argv[]) { dst = ggml_mul_mat(ctx, src0, src1); break; default: - QNN_LOG_WARN("ggml op %d(%s) not supported", n_ggml_op_type, + printf("ggml op %d(%s) not supported", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); ggml_free(ctx); ggml_backend_free(backend); @@ -472,32 +399,32 @@ int main(int argc, char * argv[]) { #ifdef GGML_USE_QNN if (n_backend_type != QNN_BACKEND_GGML) { - QNN_LOG_DEBUG("init QNN backend %d\n", n_backend_type); + printf("init QNN backend %d\n", n_backend_type); //re-init again backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); if (nullptr == backend) { - QNN_LOG_ERROR("create qnn backend %d(%s) failed\n", n_backend_type, ggml_backend_qnn_get_devname(n_backend_type)); + printf("create qnn backend %d(%s) failed\n", n_backend_type, ggml_backend_qnn_get_devname(n_backend_type)); return 1; } else { - QNN_LOG_INFO("create qnn backend %d(%s) succeed\n", n_backend_type, ggml_backend_qnn_get_devname(n_backend_type)); + printf("create qnn backend %d(%s) 
succeed\n", n_backend_type, ggml_backend_qnn_get_devname(n_backend_type)); } //buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); buffer = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); if (!buffer) { - QNN_LOG_ERROR("%s: failed to allocate backend buffer\n", __func__); + printf("%s: failed to allocate backend buffer\n", __func__); ggml_free(ctx); ggml_backend_free(backend); return 4; } } else { - QNN_LOG_DEBUG("init default cpu backend\n"); + printf("init default cpu backend\n"); backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); } #endif - QNN_LOG_DEBUG("creating compute graph\n"); + printf("creating compute graph\n"); gf = ggml_new_graph(ctx); ggml_build_forward_expand(gf, dst); @@ -519,20 +446,20 @@ int main(int argc, char * argv[]) { ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); if (get_tensor_data_size(dst) < (100 * 100)) { - QNN_LOG_DEBUG("dump result tensors:\n"); + printf("dump result tensors:\n"); TENSOR_DUMP(src0); TENSOR_DUMP(src1); TENSOR_DUMP(dst); } else { - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->nb[0], src0->nb[1], src0->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->nb[0], src1->nb[1], src1->nb[2]); - QNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", + printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], dst->nb[1], dst->nb[2]); @@ -546,7 +473,7 @@ int main(int argc, char * argv[]) { n_end_time = ggml_time_us(); n_duration = (n_end_time - n_begin_time) / 1000; #ifdef GGML_USE_QNN - QNN_LOG_DEBUG("duration of ut GGML_OP_%s using QNN backend %s: %lld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), ggml_backend_qnn_get_devname(n_backend_type), n_duration); + printf("duration of ut GGML_OP_%s using QNN backend %s: %ld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), ggml_backend_qnn_get_devname(n_backend_type), n_duration); #endif return 0; From af5f952d23eb850c3f7776ef68497635291a4f11 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 4 Mar 2025 11:14:52 +0800 Subject: [PATCH 113/200] ggml-qnn: remove un-needed function --- ggml/src/ggml-qnn/ggml-qnn.cpp | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 2aacf8f52d578..c8db722b19054 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -919,19 +919,6 @@ static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { return GGML_TYPE_COUNT; } -//TODO: add more ops -static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) { - switch (ggmlop) { - case GGML_OP_ADD: - return QNN_OP_ELEMENT_WISE_ADD; - case GGML_OP_MUL_MAT: - return 
QNN_OP_MAT_MUL; - default: - break; - } - return nullptr; -} - static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) { if (rank > GGML_MAX_DIMS) { GGMLQNN_LOG_WARN("invalid params"); @@ -1007,14 +994,13 @@ Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const c .type = qnn_tensor_type, .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, .dataType = qnn_data_type, - .quantizeParams = {QNN_DEFINITION_UNDEFINED, - QNN_QUANTIZATION_ENCODING_UNDEFINED, + .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, + .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, .rank = rank, .dimensions = tensor_dims, .memType = QNN_TENSORMEMTYPE_RAW, - {.clientBuf = {nullptr, 0} - } + .clientBuf = {.data = nullptr, .dataSize = 0} } } }; From a62276540792c8a3198fce20c9d690c0932af726 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 4 Mar 2025 11:51:25 +0800 Subject: [PATCH 114/200] ggml-qnn:rebase to upstream --- ggml/src/ggml-qnn/ggml-qnn-ops.h | 5 ----- ggml/src/ggml-qnn/ggml-qnn.cpp | 20 ++------------------ 2 files changed, 2 insertions(+), 23 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.h b/ggml/src/ggml-qnn/ggml-qnn-ops.h index c25638a9397c6..b1c388a32a87a 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.h +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.h @@ -24,13 +24,8 @@ #include "ggml-qnn-impl.h" void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * dst); void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -<<<<<<< HEAD void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -======= -void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_add(ggml_backend_qnn_context * ctx, ggml_tensor * dst); ->>>>>>> ggml-qnn: refine source code structure to make code more clearly void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst); void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index c8db722b19054..c47a07307003b 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1170,22 +1170,6 @@ void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output) } } -bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - qnn_instance * instance = ctx->instance; - if (nullptr == instance) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - return true; -} - template Fn load_qnn_functionpointers(void * handle, const char * function_name) { return reinterpret_cast(dlsym(handle, function_name)); @@ -2466,11 +2450,11 @@ static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { return ctx->buffer; } -static void ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { +static enum ggml_status ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; GGML_UNUSED(tensor); GGML_UNUSED(ctx); - return; + return 
GGML_STATUS_SUCCESS; } static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, From a645a7d5cbb5ed5eedf6170fac66226b10c3f6c6 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 4 Mar 2025 12:17:25 +0800 Subject: [PATCH 115/200] ggml-qnn: fix a minior issue during rebase to upstream --- ggml/src/ggml-qnn/ggml-qnn.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index c47a07307003b..2fa1efbfd7a92 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -2244,8 +2244,7 @@ static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const s } if (ne00 < 32) return false; - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) - && (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16); + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32); } if (tensor->op == GGML_OP_MUL_MAT) { From 10fd07dfdc89f5cdb293da8822b9f99b83b18900 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 4 Mar 2025 15:47:28 +0800 Subject: [PATCH 116/200] ggml-qnn: update script according to https://github.com/ggml-org/llama.cpp/pull/12155 --- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 4 +- scripts/build-run-android.sh | 238 +++++++++++++++++++++++++++-- 2 files changed, 229 insertions(+), 13 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 00cb7da32c183..8db6662c8f0bc 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -65,14 +65,12 @@ static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const */ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - enum ggml_status result = GGML_STATUS_SUCCESS; bool graph_initialized = false; qnn_instance * instance = nullptr; Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * p_tensor0 = nullptr; Qnn_Tensor_t * p_tensor1 = nullptr; Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Param_t qnn_params[] = {}; const ggml_tensor * src0 = op->src[0]; const ggml_tensor * src1 = op->src[1]; ggml_tensor * dst = op; @@ -170,7 +168,7 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { QNN_OP_PACKAGE_NAME_QTI_AISW, qnn_op_name, 0, - qnn_params, + nullptr, 2, tensor_inputs, 1, diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 3d239510b8d63..2ed8db9349003 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -168,7 +168,7 @@ function run_llamabench() } -function run_test-backend-ops() +function run_test-ops() { prepare_run_on_phone test-backend-ops @@ -178,6 +178,38 @@ function run_test-backend-ops() } +function run_test-op() +{ + prepare_run_on_phone test-backend-ops + + qnnbackendname=qnn-cpu + case $qnnbackend in + 0) + qnnbackendname=qnn-cpu + ;; + 1) + qnnbackendname=qnn-gpu + ;; + 2) + qnnbackendname=qnn-npu + ;; + *) + qnnbackendname=qnn-cpu + ;; + esac + + #debug + echo "adb shell cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname " + + echo "\n" + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname " + +} + function run_ut_add() { prepare_run_on_phone ggml-qnn-ut @@ -208,18 +240,101 @@ function run_ut_mul() } +function print_oplist() +{ +oplist="DUP + ADD + ADD1 + ACC + SUB + MUL + 
DIV + SQR + SQRT + LOG + SIN + COS + SUM + SUM_ROWS + MEAN + ARGMAX + COUNT_EQUAL + REPEAT + REPEAT_BACK + CONCAT + SILU_BACK + NORM + RMS_NORM + RMS_NORM_BACK + GROUP_NORM + + MUL_MAT + MUL_MAT_ID + OUT_PROD + + SCALE + SET + CPY + CONT + RESHAPE + VIEW + PERMUTE + TRANSPOSE + GET_ROWS + GET_ROWS_BACK + DIAG + DIAG_MASK_INF + DIAG_MASK_ZERO + SOFT_MAX + SOFT_MAX_BACK + ROPE + ROPE_BACK + CLAMP + CONV_TRANSPOSE_1D + IM2COL + IM2COL_BACK + CONV_TRANSPOSE_2D + POOL_1D + POOL_2D + POOL_2D_BACK + UPSCALE + PAD + PAD_REFLECT_1D + ARANGE + TIMESTEP_EMBEDDING + ARGSORT + LEAKY_RELU + + FLASH_ATTN_EXT + FLASH_ATTN_BACK + SSM_CONV + SSM_SCAN + WIN_PART + WIN_UNPART + GET_REL_POS + ADD_REL_POS + RWKV_WKV6 + GATED_LINEAR_ATTN" + +echo "opname list: " +echo ${oplist} +} function show_usage() { echo "Usage:" + echo " $0 help" + echo " $0 print_oplist" echo " $0 build" echo " $0 updateqnnlib" - echo " $0 run_testop" - echo " $0 run_ut_add 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_ut_mulmat 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_ut_mul 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_testops" + echo " $0 run_testop [ADD/MUL/MUL_MAT] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]" + echo " $0 run_ut_add 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_ut_mulmat 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_ut_mul 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo -e "\n\n\n" } @@ -238,12 +353,14 @@ elif [ $# == 1 ]; then elif [ "$1" == "help" ]; then show_usage exit 1 + elif [ "$1" == "print_oplist" ]; then + print_oplist + exit 1 elif [ "$1" == "build" ]; then build_ggml_qnn exit 0 - - elif [ "$1" == "run_testop" ]; then - run_test-backend-ops + elif [ "$1" == "run_testops" ]; then + run_test-ops exit 0 elif [ "$1" == "updateqnnlib" ]; then @@ -276,6 +393,107 @@ elif [ $# == 2 ]; then run_ut_mul exit 0 fi +elif [ $# == 3 ]; then + opname=$2 +#TODO: check opname in oplist +#opname can be found via print_oplist: +# DUP +# ADD +# ADD1 +# ACC +# SUB +# MUL +# DIV +# SQR +# SQRT +# LOG +# SIN +# COS +# SUM +# SUM_ROWS +# MEAN +# ARGMAX +# COUNT_EQUAL +# REPEAT +# REPEAT_BACK +# CONCAT +# SILU_BACK +# NORM +# RMS_NORM +# RMS_NORM_BACK +# GROUP_NORM +# +# MUL_MAT +# MUL_MAT_ID +# OUT_PROD +# +# SCALE +# SET +# CPY +# CONT +# RESHAPE +# VIEW +# PERMUTE +# TRANSPOSE +# GET_ROWS +# GET_ROWS_BACK +# DIAG +# DIAG_MASK_INF +# DIAG_MASK_ZERO +# SOFT_MAX +# SOFT_MAX_BACK +# ROPE +# ROPE_BACK +# CLAMP +# CONV_TRANSPOSE_1D +# IM2COL +# IM2COL_BACK +# CONV_TRANSPOSE_2D +# POOL_1D +# POOL_2D +# POOL_2D_BACK +# UPSCALE +# PAD +# PAD_REFLECT_1D +# ARANGE +# TIMESTEP_EMBEDDING +# ARGSORT +# LEAKY_RELU +# +# FLASH_ATTN_EXT +# FLASH_ATTN_BACK +# SSM_CONV +# SSM_SCAN +# WIN_PART +# WIN_UNPART +# GET_REL_POS +# ADD_REL_POS +# RWKV_WKV6 +# GATED_LINEAR_ATTN +# +# UNARY +# +# MAP_UNARY +# MAP_BINARY +# +# MAP_CUSTOM1_F32 +# MAP_CUSTOM2_F32 +# MAP_CUSTOM3_F32 +# +# MAP_CUSTOM1 +# MAP_CUSTOM2 +# MAP_CUSTOM3 +# +# CROSS_ENTROPY_LOSS +# CROSS_ENTROPY_LOSS_BACK +# OPT_STEP_ADAMW + qnnbackend=$3 + if [ ${qnnbackend} -gt 3 ]; then + show_usage + exit 1 + fi + run_test-op + exit 0 else show_usage exit 1 From 
c0eebf26225bf46b0dbd33078217e12f1315c645 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 4 Mar 2025 17:49:04 +0800 Subject: [PATCH 117/200] ggml-qnn: fix a minior issue in ggmlqnn_create_general_tensor() --- ggml/src/ggml-qnn/ggml-qnn.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 2fa1efbfd7a92..4f9a308914778 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -947,7 +947,7 @@ Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const c char tensor_name[GGML_MAX_NAME] = {}; //ensure the tensor name is unique - if (nullptr != name) { + if (nullptr == name) { snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); } else { snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, get_idx()); @@ -1857,6 +1857,12 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (0 != set_high_performance_mode()) { GGMLQNN_LOG_WARN("set HTP high performance mode failure"); } + + if (enable_qnn_rpc()) { + GGMLQNN_LOG_INFO("NPU RPC feature enabled"); + } else { + GGMLQNN_LOG_INFO("NPU RPC feature disabled"); + } } GGMLQNN_LOG_DEBUG("leave qni_init\n"); From a69adbf3d2e1bcac35a1eef26be5c4f4a115abec Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 4 Mar 2025 20:35:14 +0800 Subject: [PATCH 118/200] ggml-qnn: active member variable _device_id in class qnn_instance --- ggml/src/ggml-qnn/ggml-qnn-impl.h | 4 ++++ ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 1 + ggml/src/ggml-qnn/ggml-qnn.cpp | 11 +++++++++++ 3 files changed, 16 insertions(+) diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h index 5a2fe5752a097..68662d31d3738 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -498,6 +498,10 @@ class qnn_instance { return _enable_qnn_rpc; } + QNNBackend get_device_id() { + return _device_id; + } + public: std::map>> _qnn_graph_map; diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 8db6662c8f0bc..d96c33d574b41 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -118,6 +118,7 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { if (!graph_initialized) { GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + GGML_ASSERT(instance->get_device_id() == ctx->device); error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); if (QNN_SUCCESS != error) { GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 4f9a308914778..a12065f306e56 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1704,6 +1704,17 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } + _device_id = QNN_BACKEND_CPU; + if (_backend_name.find("QnnCpu") != std::string::npos) { + _device_id = QNN_BACKEND_CPU; + } + if (_backend_name.find("QnnGpu") != std::string::npos) { + _device_id = QNN_BACKEND_GPU; + } + if (_backend_name.find("QnnHtp") != std::string::npos) { + _device_id = QNN_BACKEND_NPU; + } + backend_id = _lib_path_to_backend_id[backend_lib_path]; if (0 == _loaded_backend.count(backend_id) || 0 == _loaded_lib_handle.count(backend_id)) { From 9ae615317338bc520383c3da0b40ea47616b9734 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 4 Mar 2025 22:29:06 +0800 Subject: [PATCH 119/200] 
ggml-qnn: refine ggml_qnn_general_node and ggml_qnn_mul_mat to make code more clearly --- ggml/src/ggml-qnn/ggml-qnn-impl.h | 2 +- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 406 +++++++++-------------------- ggml/src/ggml-qnn/ggml-qnn.cpp | 27 +- 3 files changed, 147 insertions(+), 288 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h index 68662d31d3738..a0a1a80cbf855 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -595,12 +595,12 @@ class qnn_instance { size_t ggmlqnn_get_opcaps_size(void); size_t ggmlqnn_get_op_index(const ggml_tensor * tensor); -Qnn_Tensor_t * ggmlqnn_create_compute_tensor(const ggml_tensor * tensor); const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code); Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype); void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output); uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata); +Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t handle, const ggml_tensor * tensor, Qnn_TensorType_t tensor_type); void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index d96c33d574b41..8a4ed15529b4c 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -65,7 +65,6 @@ static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const */ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; qnn_instance * instance = nullptr; Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * p_tensor0 = nullptr; @@ -87,75 +86,36 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { qnn_perf op_perf = qnn_perf(ggml_op_name); op_perf.start(); + //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; + std::string graph_name; ggmlqnn_get_graphkey_from_op(op, graph_name); if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensor = std::get<1>(graph_item); - p_tensor0 = tensor[0]; - p_tensor1 = tensor[1]; - p_tensor2 = tensor[2]; + //retrieve computational resource from cached QNN graph + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensor = std::get<1>(graph_item); + p_tensor0 = tensor[0]; + p_tensor1 = tensor[1]; + p_tensor2 = tensor[2]; } else { - p_tensor0 = ggmlqnn_create_compute_tensor(src0); - p_tensor1 = ggmlqnn_create_compute_tensor(src1); - p_tensor2 = ggmlqnn_create_compute_tensor(dst); - } - //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - - //ensure QNN tensor has correct tensor type - QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor1)->type = 
QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - - //save the original dimensions of qnn tensors - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; - - bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; - - if (!graph_initialized) { GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); GGML_ASSERT(instance->get_device_id() == ctx->device); + //create QNN graph error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } graph_handle = instance->get_qnn_graph_handle(); - if (enable_npu_rpc) { - QNN_VER_PTR(*p_tensor0)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor0)->clientBuf = {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*p_tensor1)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor1)->clientBuf = {.data=nullptr, .dataSize=0}; - - QNN_VER_PTR(*p_tensor2)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_tensor2)->clientBuf = {.data=nullptr, .dataSize=0}; - } - - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - - if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer_0 = ggmlqnn_create_rpc_buffer(instance, src0, p_tensor0, true); - uint8_t * qnn_rpcbuffer_1 = ggmlqnn_create_rpc_buffer(instance, src1, p_tensor1, true); - uint8_t * qnn_rpcbuffer_2 = ggmlqnn_create_rpc_buffer(instance, dst, p_tensor2, false); - if (nullptr == qnn_rpcbuffer_0 || nullptr == qnn_rpcbuffer_1 || nullptr == qnn_rpcbuffer_2) { - GGMLQNN_LOG_INFO("create rpc buffer failure\n"); - //TODO: potential memory leak although it shouldn't happen - return; - } - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - } + //create computational tensor + p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE); + p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); + p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); + //compose QNN graph Qnn_Tensor_t tensor_inputs[] = { *p_tensor0, *p_tensor1 @@ -177,100 +137,55 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { } }; CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); + //finalize QNN graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - - if (enable_npu_rpc) { - uint8_t * qnn_rpcbuffer = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer = %p\n", qnn_rpcbuffer); - if 
(nullptr != qnn_rpcbuffer) { - memcpy(dst->data, qnn_rpcbuffer, ggml_nbytes(dst)); - } - } + //cache QNN graph qnn_tensors_t ggml_op_add_tensors; ggml_op_add_tensors.reserve(3); ggml_op_add_tensors.push_back(p_tensor0); ggml_op_add_tensors.push_back(p_tensor1); ggml_op_add_tensors.push_back(p_tensor2); - auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); instance->_qnn_graph_map[graph_name] = graph_item; - } else { - Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32; - Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32; - - src0_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src0->type); - src1_qnn_type = ggmlqnn_datatype_from_ggml_datatype(src1->type); - dst_qnn_type = ggmlqnn_datatype_from_ggml_datatype(dst->type); - - uint32_t dimensions_input_0[] = {(uint32_t) src0->ne[0], (uint32_t) src0->ne[1], - (uint32_t) src0->ne[2], (uint32_t) src0->ne[3]}; - uint32_t dimensions_input_1[] = {(uint32_t) src1->ne[0], (uint32_t) src1->ne[1], - (uint32_t) src1->ne[2], (uint32_t) src1->ne[3]}; - uint32_t dimensions_output[] = {(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], - (uint32_t) dst->ne[2], (uint32_t) dst->ne[3]}; - - QNN_VER_PTR(*p_tensor0)->dimensions = dimensions_input_0; - QNN_VER_PTR(*p_tensor0)->rank = ggml_n_dims(src0); - QNN_VER_PTR(*p_tensor0)->dataType = src0_qnn_type; - - QNN_VER_PTR(*p_tensor1)->dimensions = dimensions_input_1; - QNN_VER_PTR(*p_tensor1)->rank = ggml_n_dims(src1); - QNN_VER_PTR(*p_tensor1)->dataType = src1_qnn_type; - - QNN_VER_PTR(*p_tensor2)->dimensions = dimensions_output; - QNN_VER_PTR(*p_tensor2)->rank = ggml_n_dims(dst); - QNN_VER_PTR(*p_tensor2)->dataType = dst_qnn_type; - - if (enable_npu_rpc) { - //TODO: NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); - if (nullptr != qnn_buffer_0) { - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - } - - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); - if (nullptr != qnn_buffer_1) { - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + } + + if (enable_npu_rpc) { + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); } - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - - if (enable_npu_rpc) { - //TODO:NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - if (nullptr != qnn_buffer_2) { - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); + 
GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); } + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; } - // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + if (enable_npu_rpc) { + //TODO:NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + } #if GGMLQNN_PRINT_OP_ADD_LOG op_perf.info(); @@ -478,8 +393,35 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, * and stores the result in the destination tensor `dst`. * - * @param backend the context which got through (ggml_backend_qnn_context *)backend->context for the - * QNN backend operations. + there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor + + 2. QNN's MatMul can only support input tensors with rank >= 2 + + in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose + operation when offloading mulmat to QNN backend. this implementation will handle transpose + in func ggml_qnn_create_general_tensor() + * + * this function is a good example to illustrated the second technical approach "mapping the + * entire ggml computational graph to QNN graph" without complex C++ encapsulation. or another + * pipeline of "how to utilize the Hexagon NPU maximally through QNN SDK", details could be found at + * https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 + * + * @param ctx the context of ggml-qnn backend * @param op the destination tensor where the result of the matrix multiplication will be stored. 
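 *
 * a minimal sketch of the two points above in code (names such as k_perm are illustrative and
 * not taken from this patch; the permutation values mirror param_tensor_data in the function
 * body below):
 *
 *   // ggml: ne[0] = columns, ne[1] = rows, so the 3-row/2-column example is created as
 *   struct ggml_tensor * matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
 *
 *   // one permutation per rank 1..4, fed to QNN_OP_TRANSPOSE through the "param" tensor;
 *   // for a rank-2 mulmat the MatMul output is transposed back with {1, 0}
 *   static const uint32_t k_perm[GGML_MAX_DIMS][GGML_MAX_DIMS] = {
 *       {0}, {1, 0}, {0, 2, 1}, {0, 1, 3, 2},
 *   };
 *   const uint32_t * perm = k_perm[src0_rank - 1];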
* * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated @@ -494,7 +436,6 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) */ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); qnn_instance * instance = nullptr; Qnn_GraphHandle_t graph_handle = nullptr; @@ -523,10 +464,12 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { void * wdata = ggmlqnn_type_trait(ctx, op); const size_t desired_size = ctx->desired_size; + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + std::string graph_name; ggmlqnn_get_graphkey_from_op(op, graph_name); if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; + //retrieve computational resource from cached QNN graph qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; graph_handle = std::get<0>(graph_item); qnn_tensors_t & tensors = std::get<1>(graph_item); @@ -536,144 +479,55 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { p_param_tensor = tensors[3]; p_tensor2_transpose = tensors[4]; } else { - p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - } - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - - //ensure QNN tensor has correct tensor type - QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor1)->type = QNN_TENSOR_TYPE_APP_WRITE; - QNN_VER_PTR(*p_tensor2)->type = QNN_TENSOR_TYPE_APP_READ; - - //save the original dimensions of qnn tensors - uint32_t * tensor_0_dimensions = QNN_VER_PTR(*p_tensor0)->dimensions; - uint32_t * tensor_1_dimensions = QNN_VER_PTR(*p_tensor1)->dimensions; - uint32_t * tensor_2_dimensions = QNN_VER_PTR(*p_tensor2)->dimensions; - - if (!graph_initialized) { + //create QNN graph GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - /* - there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn - 1. transpose - a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: - struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); - which like this: - +---+---+ - | 0 | 1 | - +---+---+ - | 2 | 3 | - +---+---+ - | 4 | 5 | - +---+---+ - with - ne[0] = 2 - ne[1] = 3 - there are different dimension order between ggml tensor and qnn tensor - - 2. QNN's MatMul can only support input tensors with rank >= 2 - - in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose - operation when offloading mulmat to QNN backend. 
this concise implementation will handle - transpose in func ggml_qnn_create_general_tensor() - */ - //step-1: create qnn graph - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), nullptr, &graph_handle); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); if (QNN_SUCCESS != error) { - GGMLQNN_LOG_INFO("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } - //step-2: create param tensor for mulmat of 2d/3d/4d matrix + + //create computational tensor + p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + + //create param tensor for offload 2d/3d/4d matrix multiplication const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { {0}, {1, 0}, {0, 2, 1}, {0, 1, 3, 2}, }; - uint32_t param_tensor_dims[1] = {src0_rank}; + uint32_t param_tensor_dims[1] = {src0_rank}; p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); - //step-3: create compute tensor from ggml tensor - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - if (src0_type != GGML_TYPE_F32) { - QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - - //step-4: create a transpose tensor + //create transpose tensor p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); - //step-5: compose qnn graph: add mat_mul node - Qnn_Param_t out_0_params[] = { - {QNN_PARAMTYPE_SCALAR, - QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, - .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1} - } - }; - + //compose QNN graph: add mulmat node + Qnn_Param_t out_0_params[] = {{QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; -#if 0 //leave here for easily understand code, can be removed in the future - Qnn_OpConfig_t out_0 = { - 
QNN_OPCONFIG_VERSION_1, .v1 = - {"ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - 1, - out_0_params, - 2, - out_0_inputs, - 1, - out_0_outputs} - }; -#else - Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, - out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); -#endif + Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); - //step-5: compose qnn graph: add transpose node - Qnn_Param_t out_trans1_0_params[] = { - {QNN_PARAMTYPE_TENSOR, - "perm", .tensorParam = *p_param_tensor - } - }; + //compose QNN graph: add transpose node + Qnn_Param_t out_trans1_0_params[] = { {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; -#if 0 //leave here for easily understand code, can be removed in the future - Qnn_OpConfig_t out_trans1_0 = { - QNN_OPCONFIG_VERSION_1, - .v1 = {"ggmlqnn_mulmat_transpose_opconfig", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, 1, - out_trans1_0_params, - 1, - out_trans1_0_inputs, - 1, - out_trans1_0_outputs} - }; -#else - Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("ggmlqnn_mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, - out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); -#endif + Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); - //step-6: finalize qnn graph and execute qnn graph + //finalize QNN graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - Qnn_Tensor_t input_tensors_0[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors_0[] = {*p_tensor2}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - input_tensors_0, 2, - output_tensors_0, 1, - nullptr, nullptr)); + //cache QNN graph qnn_tensors_t ggml_op_mulmat_tensors; ggml_op_mulmat_tensors.reserve(5); ggml_op_mulmat_tensors.push_back(p_tensor0); @@ -683,35 +537,27 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); instance->_qnn_graph_map[graph_name] = graph_item; - } else { - if (src0_type != GGML_TYPE_F32) { - QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - // this is the second technical approach or another pipeline of "how to utilize the Hexagon - // NPU maximally" through QNN SDK, details could be found at - // https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - 
tensor_outputs, 1, - nullptr, nullptr)); } - // restore the original dimensions of qnn tensors to avoid memory leak in func free_qnn_tensor - QNN_VER_PTR(*p_tensor0)->dimensions = tensor_0_dimensions; - QNN_VER_PTR(*p_tensor1)->dimensions = tensor_1_dimensions; - QNN_VER_PTR(*p_tensor2)->dimensions = tensor_2_dimensions; + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); op_perf.info(); } diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index a12065f306e56..abfe9135fbf4b 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1023,16 +1023,21 @@ Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const c return p_qnn_tensor; } -Qnn_Tensor_t * ggmlqnn_create_compute_tensor(const ggml_tensor * tensor) { - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], - (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; +Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + if (0 == tensor->flags) { + qnn_tensor_type = tensor_type; + } else { + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } } qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); @@ -1041,6 +1046,14 @@ Qnn_Tensor_t * ggmlqnn_create_compute_tensor(const ggml_tensor * tensor) { ggml_n_dims(tensor), dimensions, nullptr, 0); + bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == QNN_BACKEND_NPU); + if (enable_npu_rpc) { + QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); + return p_qnn_tensor; } From b5c694bf76e977e1ea592c1b8022139d3898a19b Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 6 Mar 2025 12:06:17 +0800 Subject: [PATCH 120/200] ggml-qnn: Windows port --- step4 --- common/console.cpp | 4 +- ggml/src/ggml-qnn/CMakeLists.txt | 2 + ggml/src/ggml-qnn/ggml-qnn-impl.h | 8 +- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 2 +- ggml/src/ggml-qnn/ggml-qnn.cpp | 171 +++++++++++++------ scripts/build-run-android-minimal.sh | 
240 --------------------------- scripts/build-run-android.sh | 98 +---------- scripts/build-run-windows.sh | 208 +++++++++++++++++++++++ src/llama-mmap.cpp | 8 +- 9 files changed, 346 insertions(+), 395 deletions(-) delete mode 100755 scripts/build-run-android-minimal.sh create mode 100755 scripts/build-run-windows.sh diff --git a/common/console.cpp b/common/console.cpp index 078a8d678d933..73b00aa95de9f 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -241,7 +241,9 @@ namespace console { (void)codepoint; return 1; #else - return wcwidth(codepoint); + //return wcwidth(codepoint); + (void)codepoint; + return 1; #endif } diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 1156c98fbc9d7..8cb75f6cc6fc8 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -6,6 +6,8 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") +elseif(CMAKE_SYSTEM_NAME STREQUAL "CYGWIN") + set(QNN_DEFAULT_LIB_SEARCH_PATH "/cygdrive/c/qairt/2.31.0.250130/" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android and Windows(Windows on ARM)") endif() diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h index a0a1a80cbf855..9d0bf559dd7e2 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ b/ggml/src/ggml-qnn/ggml-qnn-impl.h @@ -64,8 +64,9 @@ #include "android/log.h" #endif -#if defined(_WIN32) +#if !defined(__ANDROID__) && !defined(__linux__) #include +#include #include #endif @@ -141,7 +142,8 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char #define GQCGT ggmlqnn_create_general_tensor -#if defined(_WIN32) +//#if defined(_WIN32) +#if !defined(__ANDROID__) && !defined(__linux__) #define RTLD_GLOBAL 0x100 #define RTLD_LOCAL 0x000 #define RTLD_LAZY 0x000 @@ -188,7 +190,7 @@ enum qcom_chipset_soc_model { SM8550 = 43, // v73, SD 8 Gen 2 SM8650 = 57, // v75, SD 8 Gen 3 SM8750 = 69, // v79, SD 8 Gen 4 -#if defined(_MSC_VER) +#if !defined(__ANDROID__) && !defined(__linux__) SC7280X = 44, SC8280X = 37, SC8380XP = 60, diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp index 8a4ed15529b4c..6ade24315f99a 100644 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp @@ -124,7 +124,7 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { *p_tensor2 }; Qnn_OpConfig_t op_config = { - QNN_OPCONFIG_VERSION_1, .v1 = { + QNN_OPCONFIG_VERSION_1, { ggml_op_name, QNN_OP_PACKAGE_NAME_QTI_AISW, qnn_op_name, diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index abfe9135fbf4b..35b565c7d7669 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -53,6 +53,9 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; GGML_UNUSED(file); +#if !(defined __ANDROID__) || !(defined ANDROID) + GGML_UNUSED(level); +#endif { std::lock_guard lock(ggmlqnn_log_internal_mutex); va_list args; @@ -78,7 +81,7 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * // ================================================================================================= 
// section-3: general helper macro / data structure / function // ================================================================================================= -#if defined(_WIN32) +#if !defined(__ANDROID__) && !defined(__linux__) static const char * last_func = nullptr; static long last_err; void * dlopen(const char * dll, int flags) { @@ -121,6 +124,42 @@ const char * dlerror(void) { } #endif +#define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) +#define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) +static void * ggmlqnn_mallocz_aligned(size_t size, size_t alignment) { + uint8_t * buffer = NULL; + size_t * sp = NULL; + buffer = static_cast(calloc(1, size + GGMLQNN_MEM_ADD(alignment))); + if (!buffer) + return NULL; + sp = (size_t *)buffer; + *sp = size; + buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); + buffer[-1] = buffer - (uint8_t *)sp; + return buffer; +} + +static void * ggmlqnn_malloc_aligned(size_t size, size_t alignment) { + uint8_t * buffer = NULL; + size_t * sp = NULL; + buffer = static_cast(malloc(size + GGMLQNN_MEM_ADD(alignment))); + if (!buffer) + return NULL; + sp = (size_t *)buffer; + *sp = size; + buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); + buffer[-1] = buffer - (uint8_t *)sp; + return buffer; +} + +static void ggmqnn_free_aligned(void * ptr) { + uint8_t * old = (uint8_t *)ptr; + if (!old) + return; + old -= old[-1]; + free(old); +} + static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { return offset % alignment == 0 ? offset : offset + @@ -134,15 +173,20 @@ static size_t get_system_total_memory_in_bytes() { if (0 == sysinfo(&info)) { return (info.totalram + info.totalswap) * info.mem_unit; } - auto pages = (size_t)sysconf(_SC_PHYS_PAGES); - auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + size_t pages = (size_t)sysconf(_SC_PHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); return pages * page_size; -#elif defined(_WIN32) - //TODO: Snapdragon based WoA(Windows on ARM) - return 0; #else -#error "ggml-qnn only support WoA, Android, Linux" + //FIXME: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLQNN_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLQNN_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullTotalPhys; + } + return 0; #endif } @@ -152,15 +196,20 @@ static size_t get_system_free_memory_in_bytes() { if (0 == sysinfo(&info)) { return (info.freeram + info.freeswap) * info.mem_unit; } - auto avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); - auto page_size = (size_t)sysconf(_SC_PAGE_SIZE); + size_t avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); return avail_pages * page_size; -#elif defined(_WIN32) - //TODO: Snapdragon based WoA(Windows on ARM) - return 0; #else -#error "ggml-qnn only support WoA, Android, Linux" + //FIXME: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLQNN_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLQNN_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullAvailPhys; + } + return 0; #endif } @@ -176,22 +225,29 @@ static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, siz } static char * 
ggmlqnn_strndup(const char * source, size_t maxlen) { +#if defined(__ANDROID__) || defined(__linux__) return strndup(source, maxlen); +#else + //FIXME:behaviour is not exactly same to Android&Linux + GGML_UNUSED(maxlen); + return strdup(source); +#endif } -static void * ggmlqnn_host_malloc(size_t n) { -#if defined(__ANDROID__) || defined(__linux__) +static void * ggmlqnn_host_malloc(size_t buffer_size, size_t page_size) { void * data = nullptr; - int result = posix_memalign((void **)&data, sysconf(_SC_PAGESIZE), n); +#if defined(__ANDROID__) || defined(__linux__) + int result = posix_memalign((void **)&data, page_size, buffer_size); if (result != 0) { GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); return nullptr; } -#elif defined(_WIN32) - //TODO: Snapdragon based WoA(Windows on ARM) - return nullptr; #else -#error "ggml-qnn only support WoA, Android, Linux" + //GGMLQNN_LOG_DEBUG("buffer_size %d, page_size %d\n", buffer_size, page_size); + data = ggmlqnn_malloc_aligned(buffer_size, page_size); + if (nullptr == data) { + GGMLQNN_LOG_WARN("%s: error: host_malloc failed\n", __func__); + } #endif return data; @@ -566,71 +622,71 @@ Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices static struct qcom_socinfo g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 7 Gen 1 */ - [SM7450] = { + { .soc_model = SM7450, .htp_arch = V69, .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 7 Gen 1"}, /* Qualcomm SnapDragon 888 */ - [SM8350] = { + { .soc_model = SM8350, .htp_arch = V68, .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 888 "}, /* Qualcomm SnapDragon 8 Gen 1 */ - [SM8450] = { + { .soc_model = SM8450, .htp_arch = V69, .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8 Gen 1"}, /* Qualcomm SnapDragon 8 Gen 1+ */ - [SM8475] = { + { .soc_model = SM8475, .htp_arch = V69, .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8 Gen 1+"}, /* Qualcomm SnapDragon 8 Gen 2 */ - [SM8550] = { + { .soc_model = SM8550, .htp_arch = V73, .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8 Gen 2"}, /* Qualcomm SnapDragon 8 Gen 3 */ - [SM8650] = { + { .soc_model = SM8650, .htp_arch = V75, .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8 Gen 3 "}, /* Qualcomm SnapDragon 8 Gen 4 */ - [SM8750] = { + { .soc_model = SM8750, .htp_arch = V79, .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, -#if defined(_WIN32) +#if !defined(__ANDROID__) && !defined(__linux__) /* Qualcomm SnapDragon 7c Gen 2 */ - [SC7280X] = { + { .soc_model = SC7280X, .htp_arch = V68, .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 7c Gen 2"}, /* Qualcomm SnapDragon 8cx Gen 3 */ - [SC8280X] = { + { .soc_model = SC8280X, .htp_arch = V68, .vtcm_size_in_mb = 8, .soc_desc = "Qualcomm SnapDragon 8cx Gen 3"}, /* Qualcomm SnapDragon 8cx Gen 4 */ - [SC8380XP] = { + { .soc_model = SC8380XP, .htp_arch = V73, .vtcm_size_in_mb = 8, @@ -639,6 +695,16 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { }; + +#if defined(__ANDROID__) +static const char * g_qnn_runtimelib_path = "/data/local/tmp/"; +#elif defined(__linux__) +static const char * g_qnn_runtimelib_path = "/tmp/"; +#elif defined(_WIN32) +static const char * g_qnn_runtimelib_path = "C:\\"; +#else //cygwin on Windows +static const char * g_qnn_runtimelib_path = "/cygdrive/c/"; +#endif //the following helper funcs are used to ensure every QNN tensor name is unique static std::atomic 
g_ggmltensor_idx(0); static void reset_idx() { @@ -664,7 +730,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-cpu", .desc = "Qualcomm Kryo CPU", -#if defined(_WIN32) +#if !defined(__ANDROID__) && !defined(__linux__) .lib = "QnnCpu.dll", #else .lib = "libQnnCpu.so", @@ -679,7 +745,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-gpu", .desc = "Qualcomm Adreno GPU", -#if defined(_WIN32) +#if !defined(__ANDROID__) && !defined(__linux__) .lib = "QnnGpu.dll", #else .lib = "libQnnGpu.so", @@ -694,7 +760,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .threads = 1, .name = "qnn-npu", .desc = "Qualcomm NPU(Hexagon Tensor Processor)", -#if defined(_WIN32) +#if !defined(__ANDROID__) && !defined(__linux__) .lib = "QnnHtp.dll", #else .lib = "libQnnHtp.so", @@ -856,7 +922,7 @@ static const char * qnn_get_htparch_desc(size_t htp_arch) { } } -static struct qcom_socinfo * qnn_get_socinfo_from_socmodel(uint32_t soc_model) { +static struct qcom_socinfo * ggmlqnn_get_socinfo_from_socmodel(uint32_t soc_model) { size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); for (size_t idx = 0; idx < items; idx++) { if (soc_model == g_qnn_soc_info_table[idx].soc_model) { @@ -1538,7 +1604,7 @@ int qnn_instance::unload_backend() { int qnn_instance::load_system() { Qnn_ErrorHandle_t error = QNN_SUCCESS; -#ifdef _WIN32 +#if !defined(__ANDROID__) && !defined(__linux__) std::string system_lib_path = _lib_path + "QnnSystem.dll"; #else std::string system_lib_path = _lib_path + "libQnnSystem.so"; @@ -1549,8 +1615,8 @@ int qnn_instance::load_system() { if (nullptr == _system_lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); //re-try with default path of QNN binary runtime lib - _lib_path = "/data/local/tmp/"; -#ifdef _WIN32 + _lib_path = std::string(g_qnn_runtimelib_path); +#if !defined(__ANDROID__) && !defined(__linux__) system_lib_path = _lib_path + "QnnSystem.dll"; #else system_lib_path = _lib_path + "libQnnSystem.so"; @@ -1804,10 +1870,8 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { #if defined(__ANDROID__) || defined(__linux__) _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); -#elif defined(_WIN32) - _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); #else -#error "ggml-qnn only support WoA, Android, Linux" + _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); #endif if (nullptr == _rpc_lib_handle) { GGMLQNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); @@ -1842,7 +1906,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n"); } - if (_backend_name.find("Htp") != std::variant_npos) { + if (_backend_name.find("Htp") != std::string::npos) { const QnnDevice_PlatformInfo_t * p_info = nullptr; _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); GGMLQNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); @@ -1858,7 +1922,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { GGMLQNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ chipinfo.socModel, qnn_get_socmodel_desc(chipinfo.socModel), \ htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); - struct qcom_socinfo * socinfo = qnn_get_socinfo_from_socmodel(chipinfo.socModel); + struct qcom_socinfo * socinfo = 
ggmlqnn_get_socinfo_from_socmodel(chipinfo.socModel); g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; if (nullptr != socinfo) { memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); @@ -2546,22 +2610,25 @@ static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + size_t size_page = 0; #if defined(__ANDROID__) || defined(__linux__) - size_t size_page = sysconf(_SC_PAGESIZE); -#elif defined(_WIN32) + size_page = sysconf(_SC_PAGESIZE); +#else SYSTEM_INFO systeminfo; GetSystemInfo(&systeminfo); - size_t size_page = systeminfo.dwPageSize; + size_page = systeminfo.dwPageSize; #endif size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); } - ctx->buffer = ggmlqnn_host_malloc(size_aligned); + ctx->buffer = ggmlqnn_host_malloc(size_aligned, size_page); ctx->buffer_size = size_aligned; if (nullptr == ctx->buffer) { - GGMLQNN_LOG_WARN("%s: failed to allocate %.2f MiB\n", __func__, size / (1 << 20)); + GGMLQNN_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20)); return nullptr; + } else { + GGMLQNN_LOG_DEBUG("%s: allocate %d MiB\n", __func__, size_aligned / (1 << 20)); } return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); @@ -2572,11 +2639,10 @@ static size_t ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_typ return 32; } -//TODO:not used currently static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return (2 * (1 << 20)); + return (2 * (1 << 29)); } static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { @@ -2724,8 +2790,7 @@ static ggml_backend_t ggml_backend_qnn_device_init_backend(ggml_backend_dev_t de if (nullptr == params) { params = 0; } - ggml_backend_t qnn_backend = ggml_backend_qnn_init((int) (intptr_t) params, - "/data/local/tmp/"); + ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)params, g_qnn_runtimelib_path); return qnn_backend; @@ -2867,7 +2932,7 @@ static void * ggml_backend_qnn_reg_get_proc_address(ggml_backend_reg_t reg, cons const char * slot_name = "ggml_backend_set_n_threads"; //avoid buffer attack rather than strcmp - if (0 == std::memcmp(name, slot_name, strlen(slot_name))) { + if (0 == memcmp(name, slot_name, strlen(slot_name))) { return (void *)ggml_backend_qnn_set_n_threads; } return nullptr; diff --git a/scripts/build-run-android-minimal.sh b/scripts/build-run-android-minimal.sh deleted file mode 100755 index 1a5f362fe2083..0000000000000 --- a/scripts/build-run-android-minimal.sh +++ /dev/null @@ -1,240 +0,0 @@ -#!/bin/bash - -set -e - -PWD=`pwd` -ANDROID_PLATFORM=android-34 -ANDROID_NDK=${PWD}/android-ndk-r26c -REMOTE_PATH=/data/local/tmp/ -GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf -GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf - -#QNN SDK could be found at: -#https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk -#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk -QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.31.0.250130/ - -#default is QNN NPU -qnnbackend=2 - -function dump_vars() -{ - echo -e "ANDROID_NDK: ${ANDROID_NDK}" - echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" -} - - 
-function show_pwd() -{ - echo -e "current working path:$(pwd)\n" -} - - -function check_qnn_sdk() -{ - if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from ${QNN_SDK_URL}...\n" - exit 1 - fi -} - - -function check_and_download_ndk() -{ - is_android_ndk_exist=1 - - if [ ! -d ${ANDROID_NDK} ]; then - is_android_ndk_exist=0 - fi - - if [ ! -f ${ANDROID_NDK}/build/cmake/android.toolchain.cmake ]; then - is_android_ndk_exist=0 - fi - - if [ ${is_android_ndk_exist} -eq 0 ]; then - - if [ ! -f android-ndk-r26c-linux.zip ]; then - wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip - fi - - unzip android-ndk-r26c-linux.zip - - if [ $? -ne 0 ]; then - printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" - exit 1 - fi - - printf "android ndk saved to ${ANDROID_NDK} \n\n" - else - printf "android ndk already exist:${ANDROID_NDK} \n\n" - fi -} - - -function build_arm64 -{ - cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} - cd out/android - make -j16 - show_pwd - - cd - -} - - -function remove_temp_dir() -{ - if [ -d out ]; then - echo "remove out directory in `pwd`" - rm -rf out - fi -} - - -function check_qnn_libs() -{ - #reuse the cached qnn libs on Android phone - adb shell ls ${REMOTE_PATH}/libQnnCpu.so - if [ $? -eq 0 ]; then - printf "QNN libs already exist on Android phone\n" - else - update_qnn_libs - fi -} - - -function update_qnn_libs() -{ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnSystem.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnCpu.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnGpu.so ${REMOTE_PATH}/ - - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ -} - - -function build_ggml_qnn() -{ - show_pwd - check_and_download_ndk - check_qnn_sdk - dump_vars - remove_temp_dir - build_arm64 -} - - -function run_llamacli() -{ - check_qnn_libs - - if [ -f ./out/android/bin/libggml-qnn.so ]; then - adb push ./out/android/bin/*.so ${REMOTE_PATH}/ - fi - adb push ./out/android/bin/llama-cli ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/llama-cli - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" - -} - - -function run_llamabench() -{ - check_qnn_libs - - if [ -f ./out/android/bin/libggml-qnn.so ]; then - adb push ./out/android/bin/*.so ${REMOTE_PATH}/ - fi - adb push ./out/android/bin/llama-bench ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/llama-bench - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}" - -} - - -function run_test-backend-ops() -{ - check_qnn_libs - - if [ -f 
./out/android/bin/libggml-qnn.so ]; then - adb push ./out/android/bin/*.so ${REMOTE_PATH}/ - fi - adb push ./out/android/bin/test-backend-ops ${REMOTE_PATH}/ - adb shell chmod +x ${REMOTE_PATH}/test-backend-ops - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/test-backend-ops test" - -} - - -function show_usage() -{ - echo "Usage:" - echo " $0 build" - echo " $0 updateqnnlib" - echo " $0 run_testop" - echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo -e "\n\n\n" -} - - -show_pwd - -check_qnn_sdk - -if [ $# == 0 ]; then - show_usage - exit 1 -elif [ $# == 1 ]; then - if [ "$1" == "-h" ]; then - show_usage - exit 1 - elif [ "$1" == "help" ]; then - show_usage - exit 1 - elif [ "$1" == "build" ]; then - build_ggml_qnn - exit 0 - - elif [ "$1" == "run_testop" ]; then - run_test-backend-ops - exit 0 - elif [ "$1" == "updateqnnlib" ]; then - update_qnn_libs - exit 0 - else - show_usage - exit 1 - fi -elif [ $# == 2 ]; then - qnnbackend=$2 - if [ ${qnnbackend} -gt 3 ]; then - show_usage - exit 1 - fi - - if [ "$1" == "run_llamacli" ]; then - run_llamacli - exit 0 - elif [ "$1" == "run_llamabench" ]; then - run_llamabench - exit 0 - fi -else - show_usage - exit 1 -fi diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 2ed8db9349003..cc2828389fd16 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -85,9 +85,9 @@ function build_arm64 function remove_temp_dir() { - if [ -d out ]; then - echo "remove out directory in `pwd`" - rm -rf out + if [ -d out/android ]; then + echo "remove out/android directory in `pwd`" + rm -rf out/android fi } @@ -168,6 +168,7 @@ function run_llamabench() } +#refer to:https://github.com/ggml-org/llama.cpp/pull/12155 function run_test-ops() { prepare_run_on_phone test-backend-ops @@ -397,96 +398,7 @@ elif [ $# == 3 ]; then opname=$2 #TODO: check opname in oplist #opname can be found via print_oplist: -# DUP -# ADD -# ADD1 -# ACC -# SUB -# MUL -# DIV -# SQR -# SQRT -# LOG -# SIN -# COS -# SUM -# SUM_ROWS -# MEAN -# ARGMAX -# COUNT_EQUAL -# REPEAT -# REPEAT_BACK -# CONCAT -# SILU_BACK -# NORM -# RMS_NORM -# RMS_NORM_BACK -# GROUP_NORM -# -# MUL_MAT -# MUL_MAT_ID -# OUT_PROD -# -# SCALE -# SET -# CPY -# CONT -# RESHAPE -# VIEW -# PERMUTE -# TRANSPOSE -# GET_ROWS -# GET_ROWS_BACK -# DIAG -# DIAG_MASK_INF -# DIAG_MASK_ZERO -# SOFT_MAX -# SOFT_MAX_BACK -# ROPE -# ROPE_BACK -# CLAMP -# CONV_TRANSPOSE_1D -# IM2COL -# IM2COL_BACK -# CONV_TRANSPOSE_2D -# POOL_1D -# POOL_2D -# POOL_2D_BACK -# UPSCALE -# PAD -# PAD_REFLECT_1D -# ARANGE -# TIMESTEP_EMBEDDING -# ARGSORT -# LEAKY_RELU -# -# FLASH_ATTN_EXT -# FLASH_ATTN_BACK -# SSM_CONV -# SSM_SCAN -# WIN_PART -# WIN_UNPART -# GET_REL_POS -# ADD_REL_POS -# RWKV_WKV6 -# GATED_LINEAR_ATTN -# -# UNARY -# -# MAP_UNARY -# MAP_BINARY -# -# MAP_CUSTOM1_F32 -# MAP_CUSTOM2_F32 -# MAP_CUSTOM3_F32 -# -# MAP_CUSTOM1 -# MAP_CUSTOM2 -# MAP_CUSTOM3 -# -# CROSS_ENTROPY_LOSS -# CROSS_ENTROPY_LOSS_BACK -# OPT_STEP_ADAMW + qnnbackend=$3 if [ ${qnnbackend} -gt 3 ]; then show_usage diff --git a/scripts/build-run-windows.sh b/scripts/build-run-windows.sh new file mode 100755 index 0000000000000..4c7cad1aeb111 --- /dev/null +++ b/scripts/build-run-windows.sh @@ -0,0 +1,208 @@ +#!/bin/bash + +set -e + +PWD=`pwd` +PREFIX_PATH=/cygdrive/c +GGUF_MODEL_NAME=${PREFIX_PATH}/qwen1_5-1_8b-chat-q4_0.gguf + +#QNN SDK could be found at: 
+#https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools +QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk +QNN_SDK_PATH=${PREFIX_PATH}/qairt/2.31.0.250130/ + +#default is QNN NPU +qnnbackend=2 + +function dump_vars() +{ + echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" +} + + +function show_pwd() +{ + echo -e "current working path:$(pwd)\n" +} + + +function check_qnn_sdk() +{ + if [ ! -d ${QNN_SDK_PATH} ]; then + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from ${QNN_SDK_URL}...\n" + exit 1 + fi +} + +function build_windows_x86 +{ + echo "build_windows_x86-without-qnn" + cmake -H. -B./out/windows_x86 -DCMAKE_BUILD_TYPE=Release + cd out/windows_x86 + make -j16 + show_pwd + + cd - +} + +function build_windows_x86_qnn +{ + echo "build_windows_x86-with-qnn" + cmake -H. -B./out/windows_x86_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + cd out/windows_x86_qnn + make -j16 + show_pwd + + cd - +} + +function build_windows_arm64_qnn +{ + echo "build_windows_arm64 not supported now" + #cmake -H. -B./out/windows_arm64_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${MSSDK}/cmake/arm64-windows-llvm.cmake -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} +} + + +function remove_temp_dir() +{ + if [ -d out/windows_x86 ]; then + echo "remove out/windows_x86 directory in `pwd`" + rm -rf out/windows_x86 + fi +} + + +function check_qnn_libs() +{ + echo "do nothing" +} + + +function update_qnn_libs() +{ + echo "do nothing" +} + +function build_x86() +{ + show_pwd + check_qnn_sdk + dump_vars + #some unexpected behaviour on Windows + #remove_temp_dir + build_windows_x86 +} + +function build_x86_qnn() +{ + show_pwd + check_qnn_sdk + dump_vars + #some unexpected behaviour on Windows + #remove_temp_dir + build_windows_x86_qnn +} + +function build_arm64_qnn() +{ + show_pwd + check_qnn_sdk + dump_vars + #some unexpected behaviour on Windows + #remove_temp_dir + build_windows_arm64_qnn +} + +function run_llamacli() +{ + check_qnn_libs + echo "not supported on Windows now" + + #llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\" + +} + + +function run_llamabench() +{ + check_qnn_libs + echo "not supported on Windows now" + + #llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}" + +} + + +function run_test-backend-ops() +{ + check_qnn_libs + echo "not supported on Windows now" + + #test-backend-ops test" + +} + + +function show_usage() +{ + echo "Usage:" + echo " $0 build_x86" + echo " $0 build_x86_qnn" + echo " $0 build_arm64_qnn" + echo " $0 run_testop" + echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo -e "\n\n\n" +} + + +show_pwd + +check_qnn_sdk + +if [ $# == 0 ]; then + show_usage + exit 1 +elif [ $# == 1 ]; then + if [ "$1" == "-h" ]; then + show_usage + exit 1 + elif [ "$1" == "help" ]; then + show_usage + exit 1 + elif [ "$1" == "build_x86" ]; then + build_x86 + exit 0 + elif [ "$1" == "build_x86_qnn" ]; then + build_x86_qnn + exit 0 + elif [ "$1" == "build_arm64_qnn" ]; then + build_arm64_qnn + exit 0 + + elif [ "$1" == "run_testop" ]; then + run_test-backend-ops + exit 0 + else + show_usage + exit 1 + fi +elif [ $# == 2 ]; then + qnnbackend=$2 + if [ ${qnnbackend} -gt 3 ]; then 
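+        # ${qnnbackend} follows the ids listed in show_usage: 0 (QNN_CPU), 1 (QNN_GPU), 2 (QNN_NPU), 3 (ggml); values above 3 only print usage and exit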
+ show_usage + exit 1 + fi + + if [ "$1" == "run_llamacli" ]; then + run_llamacli + exit 0 + elif [ "$1" == "run_llamabench" ]; then + run_llamabench + exit 0 + fi +else + show_usage + exit 1 +fi diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 9da97f1bc5057..7345eee2ea989 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -481,10 +481,10 @@ struct llama_mlock::impl { // Skip resource limit checks on visionOS/tvOS suggest = false; #else - struct rlimit lock_limit; - if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { - suggest = false; - } + struct rlimit lock_limit = {}; + //if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { + // suggest = false; + //} if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { suggest = false; } From eabb91164c4e3bdce2ef26ca1f1d60aedae7b9f7 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 7 Mar 2025 15:03:52 +0800 Subject: [PATCH 121/200] ggml-qnn: Windows port -- step5 --- cmake/aarch64-w64-mingw32.cmake | 18 ++++++++++++++++++ ggml/src/ggml-qnn/CMakeLists.txt | 1 + scripts/build-run-android.sh | 1 + scripts/build-run-windows.sh | 20 +++++++++++++++++--- 4 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 cmake/aarch64-w64-mingw32.cmake diff --git a/cmake/aarch64-w64-mingw32.cmake b/cmake/aarch64-w64-mingw32.cmake new file mode 100644 index 0000000000000..775fa46337628 --- /dev/null +++ b/cmake/aarch64-w64-mingw32.cmake @@ -0,0 +1,18 @@ +#TODO +#not work on Linux +set( CMAKE_SYSTEM_NAME mingw ) +set( CMAKE_SYSTEM_PROCESSOR arm64 ) + +set( target aarch64-w64-mingw32 ) + +set( CMAKE_C_COMPILER aarch64-w64-mingw32-gcc ) +set( CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++ ) + +set( CMAKE_C_COMPILER_TARGET ${target} ) +set( CMAKE_CXX_COMPILER_TARGET ${target} ) + +#set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) +#set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) + +set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) +set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 8cb75f6cc6fc8..c11e2f82fa92b 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -23,6 +23,7 @@ endif() message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_QNN") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index cc2828389fd16..5e69024298dbe 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -1,4 +1,5 @@ #!/bin/bash +# build llama.cpp + ggml-qnn for Snapdragon mobile SoC equipped Android phone on Linux set -e diff --git a/scripts/build-run-windows.sh b/scripts/build-run-windows.sh index 4c7cad1aeb111..8221293a431d4 100755 --- a/scripts/build-run-windows.sh +++ b/scripts/build-run-windows.sh @@ -1,10 +1,16 @@ #!/bin/bash +# build llama.cpp or llama.cpp + ggml-qnn for Windows with cygwin on Windows +# build llama.cpp + ggml-qnn for Snapdragon desktop SoC equipped WoA(Windows on ARM) with cygwin on Windows + +# items marked TODO has not verified yet set -e + PWD=`pwd` PREFIX_PATH=/cygdrive/c GGUF_MODEL_NAME=${PREFIX_PATH}/qwen1_5-1_8b-chat-q4_0.gguf +PROJECT_HOME_PATH=`pwd` #QNN SDK could be found at: #https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk @@ -38,7 
+44,7 @@ function check_qnn_sdk() function build_windows_x86 { echo "build_windows_x86-without-qnn" - cmake -H. -B./out/windows_x86 -DCMAKE_BUILD_TYPE=Release + cmake -H. -B./out/windows_x86 -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF cd out/windows_x86 make -j16 show_pwd @@ -49,7 +55,7 @@ function build_windows_x86 function build_windows_x86_qnn { echo "build_windows_x86-with-qnn" - cmake -H. -B./out/windows_x86_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + cmake -H. -B./out/windows_x86_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} cd out/windows_x86_qnn make -j16 show_pwd @@ -57,10 +63,18 @@ function build_windows_x86_qnn cd - } +#TODO function build_windows_arm64_qnn { echo "build_windows_arm64 not supported now" - #cmake -H. -B./out/windows_arm64_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${MSSDK}/cmake/arm64-windows-llvm.cmake -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + return 0 + echo "cmake source dir:${PROJECT_HOME_PATH}" + cmake -H. -B./out/windows_arm64_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DGGML_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${PROJECT_HOME_PATH}/cmake/arm64-windows-llvm.cmake -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + cd out/windows_arm64_qnn + make -j16 + show_pwd + + cd - } From 630f1540536ab98d8df4c4c77f35fa26614e1be8 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 8 Mar 2025 08:09:02 +0800 Subject: [PATCH 122/200] ggml-qnn: WoA(Windows on ARM) -- step6 --- cmake/arm64-windows-cygwin.cmake | 16 ++++++++++++++++ cmake/arm64-windows-llvm.cmake | 4 ++-- scripts/build-run-windows.sh | 2 +- 3 files changed, 19 insertions(+), 3 deletions(-) create mode 100644 cmake/arm64-windows-cygwin.cmake diff --git a/cmake/arm64-windows-cygwin.cmake b/cmake/arm64-windows-cygwin.cmake new file mode 100644 index 0000000000000..c7a313bb77adf --- /dev/null +++ b/cmake/arm64-windows-cygwin.cmake @@ -0,0 +1,16 @@ +set( CMAKE_SYSTEM_NAME CYGWIN) +set( CMAKE_SYSTEM_PROCESSOR arm64 ) + +set( target aarch64-w64-cygwin) + +set( CMAKE_C_COMPILER clang ) +set( CMAKE_CXX_COMPILER clang++ ) + +set( CMAKE_C_COMPILER_TARGET ${target} ) +set( CMAKE_CXX_COMPILER_TARGET ${target} ) + +set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) +set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) + +set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) +set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/cmake/arm64-windows-llvm.cmake b/cmake/arm64-windows-llvm.cmake index 8023796800683..983206032df3d 100644 --- a/cmake/arm64-windows-llvm.cmake +++ b/cmake/arm64-windows-llvm.cmake @@ -9,8 +9,8 @@ set( CMAKE_CXX_COMPILER clang++ ) set( CMAKE_C_COMPILER_TARGET ${target} ) set( CMAKE_CXX_COMPILER_TARGET ${target} ) -set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) -set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) +#set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) +#set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/scripts/build-run-windows.sh 
b/scripts/build-run-windows.sh index 8221293a431d4..c9a5b13d71d4c 100755 --- a/scripts/build-run-windows.sh +++ b/scripts/build-run-windows.sh @@ -67,9 +67,9 @@ function build_windows_x86_qnn function build_windows_arm64_qnn { echo "build_windows_arm64 not supported now" - return 0 echo "cmake source dir:${PROJECT_HOME_PATH}" cmake -H. -B./out/windows_arm64_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DGGML_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${PROJECT_HOME_PATH}/cmake/arm64-windows-llvm.cmake -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + #cmake -H. -B./out/windows_arm64_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DGGML_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${PROJECT_HOME_PATH}/cmake/arm64-windows-cygwin.cmake -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} cd out/windows_arm64_qnn make -j16 show_pwd From 47042eecaf8fc599c34ffe472d1efc38ef01511b Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 9 Mar 2025 14:22:50 +0800 Subject: [PATCH 123/200] ggml-qnn: rebase to upstream --- common/console.cpp | 4 +- examples/export-lora/export-lora.cpp | 2 +- ggml/src/ggml-qnn/ggml-qnn-impl.h | 617 --------- ggml/src/ggml-qnn/ggml-qnn-ops.cpp | 687 ---------- ggml/src/ggml-qnn/ggml-qnn-ops.h | 52 - ggml/src/ggml-qnn/ggml-qnn.cpp | 1751 ++++++++++++++++++++++---- scripts/build-run-android.sh | 54 +- scripts/build-run-windows.sh | 222 ---- src/llama-mmap.cpp | 8 +- 9 files changed, 1522 insertions(+), 1875 deletions(-) delete mode 100644 ggml/src/ggml-qnn/ggml-qnn-impl.h delete mode 100644 ggml/src/ggml-qnn/ggml-qnn-ops.cpp delete mode 100644 ggml/src/ggml-qnn/ggml-qnn-ops.h delete mode 100755 scripts/build-run-windows.sh diff --git a/common/console.cpp b/common/console.cpp index 73b00aa95de9f..078a8d678d933 100644 --- a/common/console.cpp +++ b/common/console.cpp @@ -241,9 +241,7 @@ namespace console { (void)codepoint; return 1; #else - //return wcwidth(codepoint); - (void)codepoint; - return 1; + return wcwidth(codepoint); #endif } diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index f038019b007b4..24dc85cf27336 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -148,7 +148,7 @@ struct lora_merge_ctx { ctx_out = gguf_init_empty(); struct ggml_init_params params = { - /*.mem_size =*/ static_cast(gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead()), + /*.mem_size =*/ gguf_get_n_tensors(base_model.ctx_gguf)*ggml_tensor_overhead(), /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; diff --git a/ggml/src/ggml-qnn/ggml-qnn-impl.h b/ggml/src/ggml-qnn/ggml-qnn-impl.h deleted file mode 100644 index 9d0bf559dd7e2..0000000000000 --- a/ggml/src/ggml-qnn/ggml-qnn-impl.h +++ /dev/null @@ -1,617 +0,0 @@ -/* -* Copyright (c) 2023-2024 The ggml authors -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to -* deal in the Software without restriction, including without limitation the -* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -* sell copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. 
-* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -*/ -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#if defined(__ANDROID__) || defined(__linux__) -#include -#include -#include -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if (defined __ANDROID__) || (defined ANDROID) -#include "android/log.h" -#endif - -#if !defined(__ANDROID__) && !defined(__linux__) -#include -#include -#include -#endif - -#include "QnnTypes.h" -#include "QnnCommon.h" -#include "QnnContext.h" -#include "QnnBackend.h" -#include "QnnGraph.h" -#include "QnnProperty.h" -#include "QnnTensor.h" -#include "QnnInterface.h" -#include "Saver/QnnSaver.h" -#include "System/QnnSystemInterface.h" -#include "HTP/QnnHtpDevice.h" -#include "HTP/QnnHtpGraph.h" - -#include "ggml-qnn.h" -#include "ggml-impl.h" -#include "ggml-backend-impl.h" - -class qnn_instance; -struct ggml_backend_qnn_context; -void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); - -#if 0//def NDEBUG -#define GGMLQNN_DEBUG 0 -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 0 -#else -#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 -#endif -#define GGML_QNN_LOGBUF_LEN 4096 - -#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) - -#if GGMLQNN_DEBUG -#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define GGMLQNN_LOG_DEBUG(...) 
-#endif - -#define CHECK_QNN_API(error, result) \ - do { \ - error = (result); \ - if (QNN_SUCCESS != error) { \ - if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ - GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ - } else { \ - GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_error_string(error)); \ - } \ - } \ - } while (0) - -#define QNN_VER_PTR(x) (&((x).v1)) -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - -#define DISABLE_COPY(class_name) \ - class_name(const class_name &) = delete; \ - void operator=(const class_name &) = delete - -#define DISABLE_MOVE(class_name) \ - class_name(class_name &&) = delete; \ - void operator=(class_name &&) = delete - -#define GQCGT ggmlqnn_create_general_tensor - -//#if defined(_WIN32) -#if !defined(__ANDROID__) && !defined(__linux__) -#define RTLD_GLOBAL 0x100 -#define RTLD_LOCAL 0x000 -#define RTLD_LAZY 0x000 -#define RTLD_NOW 0x001 -void * dlopen(const char * filename, int flag); -int dlclose(void * handle); -void * dlsym(void* handle, const char* name); -const char * dlerror(void); -#endif - -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); - -using qnn_res_t = std::tuple>; -using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; - -enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; - -enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - V79 = 79, -}; - -enum qcom_chipset_soc_model { - UNKNOWN_SM = 0, - SM7450 = 41, // v69, 7 Gen1 - SM8350 = 30, // v68, 888 - SM8450 = 36, // v69, SD 8 Gen 1 - SM8475 = 42, // v69, SD 8+ Gen 1 - SM8550 = 43, // v73, SD 8 Gen 2 - SM8650 = 57, // v75, SD 8 Gen 3 - SM8750 = 69, // v79, SD 8 Gen 4 -#if !defined(__ANDROID__) && !defined(__linux__) - SC7280X = 44, - SC8280X = 37, - SC8380XP = 60, -#endif -}; - -struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; - char soc_desc[GGML_MAX_NAME]; -}; - -struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char desc[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - struct qcom_socinfo socinfo; - - std::unique_ptr work_data; - std::vector> tasks; - size_t work_size = 0; - size_t desired_size = 0; - int n_threads = GGML_DEFAULT_N_THREADS; -}; - -struct qnn_op_caps_t { - const char * qnn_op_name = nullptr; - const size_t input_param_count = 0; - const char * qnn_param_name = nullptr; -}; -extern const qnn_op_caps_t ggmlqnn_k_op_caps[]; - -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() { - _begin_time = ggml_time_us(); - } - - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } - 
-private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; -}; -#else -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) { - GGML_UNUSED(perf_name); - } - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() {} - void info() {} -}; -#endif - -class qnn_interface { -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } - - friend class qnn_instance; - -public: - qnn_interface() = default; - - // QnnBackend - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree) - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage) - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig) - - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion) - - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree) - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure) - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo) - - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo) - - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize) - - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary) - - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary) - - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree) - - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode) - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize) - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) - - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve) - - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree) - - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel) - - // QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate) - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents) - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents) - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData) - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree) - - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister) - - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister) - - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability) - - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor) - - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor) - - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate) - 
- DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo) - - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree) - - void set_qnn_interface(const QnnInterface_t * qnn_interface) { - _qnn_interface = qnn_interface; - } - - void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; - } - - uint32_t get_backend_id() const { - return _qnn_interface->backendId; - } - - bool is_loaded() const { - return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); - } - -private: - const QnnInterface_t * _qnn_interface = nullptr; - - const QnnSystemInterface_t * _qnn_sys_interface = nullptr; -}; - -class qnn_instance { -public: - using BackendIdType = decltype(QnnInterface_t{}.backendId); - - explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, - const std::string & model_name) : - _lib_path(std::move(lib_path)), - _backend_name(std::move(backend_name)), - _model_name(std::move(model_name)) {} - - ~qnn_instance() { - } - - int qnn_init(const QnnSaver_Config_t ** saver_config); - - int qnn_finalize(); - - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_interface; - } - - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } - - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_system_interface; - } - - Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - - Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - - Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - - Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - - Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - - QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - - Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } - - int init_qnn_graph(const char * graph_name, - bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr - ); - int init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); - - int finalize_qnn_graph(); - - bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } - - int init_htp_perfinfra(); - - int set_rpc_polling(); - - int set_high_performance_mode(); - - std::string & get_qnn_graph_name() { return _graph_name; } - - bool is_rpcmem_initialized() { - return _rpcmem_initialized; - } - - void set_rpcmem_initialized(bool initialized) { - _rpcmem_initialized = initialized; - } - - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - size_t get_rpcmem_usage() { return _rpcmem_usage; } - - int32_t rpcmem_to_fd(void * buf); - - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); - Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); - - void unregister_rpcmem(); - void unregister_rpcmem(Qnn_MemHandle_t mem_handle); - - void * alloc_rpcmem(size_t bytes, size_t alignment); - void * 
get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); - - void free_rpcmem(void * buf); - void free_rpcmem(); - - bool is_rpcmem_allocated(void * buf); - - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { - return _qnn_mem_set.count(handle) != 0U; - } - - bool enable_qnn_rpc() { - return _enable_qnn_rpc; - } - - QNNBackend get_device_id() { - return _device_id; - } - -public: - std::map>> _qnn_graph_map; - -private: - int load_system(); - - int unload_system(); - - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); - - int unload_backend(); - - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; - } - - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; - } - - void * alloc_rpcmem_internal(size_t bytes, size_t alignment); - - void probe_device_meminfo(); - -private: - static constexpr const int _required_num_providers = 1; - -private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // name of prebuilt QNN model, might be used in the future - BackendIdType _backend_id; - - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; - - void * _system_lib_handle = nullptr; - - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; - - Qnn_LogHandle_t _qnn_log_handle = nullptr; - - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; - - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; - - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; - - Qnn_ContextHandle_t _qnn_context_handle = nullptr; - - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; - - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing - - qnn_interface _qnn_interface; - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; - - std::unordered_map _qnn_mem_set; - std::unordered_map _qnn_rpc_buffer_to_handles; - - static std::mutex _init_mutex; - static std::unordered_map _loaded_lib_handle; - static std::unordered_map _lib_path_to_backend_id; - static std::unordered_map _loaded_backend; - - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - std::unordered_map _rpcmem_usage_map; - size_t _rpcmem_usage = 0; // mempool usage in Mbytes - size_t _rpcmem_capacity = 512; // mempool size in Mbytes - - std::string _graph_name; - QNNBackend _device_id; - void * _rpc_lib_handle = nullptr; - bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature - - DISABLE_COPY(qnn_instance); - DISABLE_MOVE(qnn_instance); -}; - -size_t ggmlqnn_get_opcaps_size(void); -size_t ggmlqnn_get_op_index(const ggml_tensor * tensor); -const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code); -Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype); -void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); -void 
ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output); -uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata); -Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t handle, const ggml_tensor * tensor, Qnn_TensorType_t tensor_type); -void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, - Qnn_Param_t * params, uint32_t num_params, - Qnn_Tensor_t * inputs, uint32_t num_inputs, - Qnn_Tensor_t * outputs, uint32_t num_outputs); -Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, - Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, - uint32_t rank, uint32_t * dims, - void * data, uint32_t data_size, - bool b_transpose = false); diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp b/ggml/src/ggml-qnn/ggml-qnn-ops.cpp deleted file mode 100644 index 6ade24315f99a..0000000000000 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.cpp +++ /dev/null @@ -1,687 +0,0 @@ -/* - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ -#include "ggml-impl.h" -#include "ggml-common.h" -#include "ggml-qnn-ops.h" - -static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - size_t n_dims = ggml_get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} - -static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - qnn_instance * instance = ctx->instance; - if (nullptr == instance) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - return true; -} - -#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - -/* - * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input - * tensor and 1 output tensor -*/ -void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - qnn_instance * instance = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - size_t qnn_op_index = ggmlqnn_get_op_index(op); - GGML_ASSERT(qnn_op_index < ggmlqnn_get_opcaps_size()); - const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; - std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); - const char * ggml_op_name = ggml_op_name_string.c_str(); - - qnn_perf op_perf = qnn_perf(ggml_op_name); - op_perf.start(); - - //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; - - std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - //retrieve computational resource from cached QNN graph - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensor = std::get<1>(graph_item); - p_tensor0 = tensor[0]; - p_tensor1 = tensor[1]; - p_tensor2 = tensor[2]; - } else { - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - GGML_ASSERT(instance->get_device_id() == ctx->device); - //create QNN graph - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); - return; - } - graph_handle = instance->get_qnn_graph_handle(); - - //create computational tensor - p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE); - p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); - p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); - - //compose QNN graph - Qnn_Tensor_t tensor_inputs[] 
= { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - Qnn_OpConfig_t op_config = { - QNN_OPCONFIG_VERSION_1, { - ggml_op_name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, - 0, - nullptr, - 2, - tensor_inputs, - 1, - tensor_outputs - } - }; - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); - //finalize QNN graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - - //cache QNN graph - qnn_tensors_t ggml_op_add_tensors; - ggml_op_add_tensors.reserve(3); - ggml_op_add_tensors.push_back(p_tensor0); - ggml_op_add_tensors.push_back(p_tensor1); - ggml_op_add_tensors.push_back(p_tensor2); - auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; - } - - if (enable_npu_rpc) { - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); - if (nullptr != qnn_buffer_0) { - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - } - - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); - if (nullptr != qnn_buffer_1) { - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - } - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - if (enable_npu_rpc) { - //TODO:NPU RPC feature will failed with test-backend-ops - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - if (nullptr != qnn_buffer_2) { - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - } - -#if GGMLQNN_PRINT_OP_ADD_LOG - op_perf.info(); -#endif -} - -/* - * this function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend - * UT in ggml-qnn-ut.cpp passed: - * ./scripts/build-run-android.sh run_ut_mulmat 0 - * ./scripts/build-run-android.sh run_ut_mulmat 1 - * ./scripts/build-run-android.sh run_ut_mulmat 2 - * - * the logic of ggml_qnn_mul_mat_4d is similar to ggml_qnn_mul_mat but much more complicated - * than ggml_qnn_mul_mat, so it's a standalone function. 
- * it will be combined with ggml_qnn_mul_mat in the future - */ -static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); - qnn_instance *instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - const ggml_tensor *src0 = op->src[0]; - const ggml_tensor *src1 = op->src[1]; - ggml_tensor *dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); - op_perf.start(); - - std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); - GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); - - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t *p_tensor0 = nullptr; - Qnn_Tensor_t *p_reshape0_out = nullptr; - Qnn_Tensor_t *p_tile0_out = nullptr; - Qnn_Tensor_t *p_tensor1 = nullptr; - Qnn_Tensor_t *p_permute1_out = nullptr; - Qnn_Tensor_t *p_reshape1_out = nullptr; - Qnn_Tensor_t *p_matmul_out = nullptr; - Qnn_Tensor_t *p_reshape2_out = nullptr; - - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - graph_initialized = true; - qnn_res_t &graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t &tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_reshape0_out = tensors[1]; - p_tile0_out = tensors[2]; - p_tensor1 = tensors[3]; - p_permute1_out = tensors[4]; - p_reshape1_out = tensors[5]; - p_matmul_out = tensors[6]; - p_reshape2_out = tensors[7]; - } else { - CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), NULL, &graph_handle)); - - // Define dimensions - uint32_t K = src0->ne[0]; // Inner dimension - uint32_t M = src0->ne[1]; // Rows of src0 - uint32_t N = src1->ne[1]; // Columns of src1 - uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch - uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) - - // Validate K only - GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match - - // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] - uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; - p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src0_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - - // Reshape src0 to [B0, M, K] - uint32_t reshape0_out_dims[] = {B0, M, K}; - p_reshape0_out = GQCGT(nullptr, "reshape0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - reshape0_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape0_out)); - Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; - Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; - Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape0_inputs, 1, reshape0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); - - // Tile src0 to match B1: [B0, M, K] -> [B1, M, K] - uint32_t tile0_out_dims[] = {B1, M, K}; - p_tile0_out = GQCGT(nullptr, "tile0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - tile0_out_dims, nullptr, 0); - CHECK_QNN_API(error, 
qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile0_out)); - uint32_t tile_multiples[] = {B1 / B0, 1, 1}; - uint32_t tile_dims[] = {3}; - Qnn_Tensor_t *p_tile_multiples = GQCGT(nullptr, "tile_multiples", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - tile_dims, tile_multiples, sizeof(tile_multiples)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile_multiples)); - Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; - Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; - Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; - Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TILE, tile_params, 1, - tile0_inputs, 1, tile0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); - - // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] - uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; - p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src1_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - - // Permute src1 to [B1, H1, K, N] - uint32_t perm_data[] = {0, 1, 3, 2}; - uint32_t perm_dims[] = {4}; - Qnn_Tensor_t *p_perm = GQCGT(nullptr, "perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - perm_dims, perm_data, sizeof(perm_data)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_perm)); - uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; - p_permute1_out = GQCGT(nullptr, "permute1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, - permute1_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_permute1_out)); - Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; - Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; - Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; - Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, permute1_params, 1, - permute1_inputs, 1, permute1_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); - - // Reshape src1 to [B1, K, N] - uint32_t reshape1_out_dims[] = {B1, K, N}; - p_reshape1_out = GQCGT(nullptr, "reshape1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - reshape1_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape1_out)); - Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; - Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; - Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape1_inputs, 1, reshape1_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); - - // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] - uint32_t matmul_out_dims[] = {B1, M, N}; - p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - matmul_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); - Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; - Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; - Qnn_OpConfig_t 
matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, nullptr, 0, - matmul_inputs, 2, matmul_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - - // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] - uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - p_reshape2_out = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, - reshape2_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape2_out)); - Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; - Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; - Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape2_inputs, 1, reshape2_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); - - // Finalize - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - - // Cache - qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out}; - instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - } - - // Execute - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; - QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; - - Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, - output_tensors, 1, NULL, NULL)); - -#if 0 - // Log dst for debugging - float *dst_data = (float *)dst->data; - GGMLQNN_LOG_DEBUG("dst shape: [%d, %d, %d, %d]\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); - for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) { - GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]); - } -#endif - - op_perf.info(); -} - -/* - * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs - * using the QNN backend. this function performs matrix multiplication of the input tensor - * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, - * and stores the result in the destination tensor `dst`. - * - there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn - 1. transpose - a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: - struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); - which like this: - +---+---+ - | 0 | 1 | - +---+---+ - | 2 | 3 | - +---+---+ - | 4 | 5 | - +---+---+ - with - ne[0] = 2 - ne[1] = 3 - there are different dimension order between ggml tensor and qnn tensor - - 2. QNN's MatMul can only support input tensors with rank >= 2 - - in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose - operation when offloading mulmat to QNN backend. this implementation will handle transpose - in func ggml_qnn_create_general_tensor() - * - * this function is a good example to illustrated the second technical approach "mapping the - * entire ggml computational graph to QNN graph" without complex C++ encapsulation. 
or another - * pipeline of "how to utilize the Hexagon NPU maximally through QNN SDK", details could be found at - * https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 - * - * @param ctx the context of ggml-qnn backend - * @param op the destination tensor where the result of the matrix multiplication will be stored. - * - * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated - * than ggml_qnn_general_node. so it's a standalone function. accordingly, this is another - * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute - * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds - * of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend - * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) - * and src1 is F32, src0 -> f32 in src0', then src0' * src1 -*/ -void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); - qnn_instance * instance = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Tensor_t * p_param_tensor = nullptr; - Qnn_Tensor_t * p_tensor2_transpose = nullptr; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); - - const enum ggml_type src0_type = src0->type; - const uint32_t src0_rank = ggml_n_dims(src0); - const uint32_t src1_rank = ggml_n_dims(src1); - GGML_ASSERT(src0_rank == src1_rank); - GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy - if (4 == src0_rank) { - return ggml_qnn_mul_mat_4d(ctx, op); - } - void * wdata = ggmlqnn_type_trait(ctx, op); - const size_t desired_size = ctx->desired_size; - - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - - std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { - //retrieve computational resource from cached QNN graph - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_tensor1 = tensors[1]; - p_tensor2 = tensors[2]; - p_param_tensor = tensors[3]; - p_tensor2_transpose = tensors[4]; - } else { - //create QNN graph - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); - return; - } - - //create computational tensor - p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 
0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); - - //create param tensor for offload 2d/3d/4d matrix multiplication - const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { - {0}, - {1, 0}, - {0, 2, 1}, - {0, 1, 3, 2}, - }; - uint32_t param_tensor_dims[1] = {src0_rank}; - p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); - - //create transpose tensor - p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); - - //compose QNN graph: add mulmat node - Qnn_Param_t out_0_params[] = {{QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; - Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); - - //compose QNN graph: add transpose node - Qnn_Param_t out_trans1_0_params[] = { {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; - Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; - Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; - Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); - - //finalize QNN graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - - //cache QNN graph - qnn_tensors_t ggml_op_mulmat_tensors; - ggml_op_mulmat_tensors.reserve(5); - ggml_op_mulmat_tensors.push_back(p_tensor0); - ggml_op_mulmat_tensors.push_back(p_tensor1); - ggml_op_mulmat_tensors.push_back(p_tensor2); - ggml_op_mulmat_tensors.push_back(p_param_tensor); - ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); - auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; - } - - if (src0_type != GGML_TYPE_F32) { - QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - op_perf.info(); -} - -void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - 
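The mulmat comment above notes the dimension-order gap between ggml and QNN tensors; as a minimal sketch (the helper name ggml_dims_to_qnn_dims and its standalone form are illustrative assumptions, not part of this patch), the reversal that ggmlqnn_create_general_tensor and ggml_qnn_mul_mat_4d apply to ne[] amounts to:

    #include <cstdint>
    #include <cstddef>

    // ggml keeps ne[0] as the innermost (fastest-varying) dimension, while the QNN
    // tensors built by this backend expect the outermost dimension first, so the
    // ne[] array is reversed before being handed to QNN.
    static void ggml_dims_to_qnn_dims(const int64_t * ne, size_t rank, uint32_t * qnn_dims) {
        for (size_t i = 0; i < rank; i++) {
            qnn_dims[i] = (uint32_t) ne[rank - 1 - i];
        }
    }

    // e.g. the 3x2 matrix from the comment above, created with
    // ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3), has ne = {2, 3} and maps to the
    // QNN dimension array {3, 2}; the transpose node added after MatMul then
    // restores the output layout ggml expects.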
-void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); - GGML_UNUSED(value); -} - -void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - ggml_qnn_dup(ctx, dst); -} - -void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} - -void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} diff --git a/ggml/src/ggml-qnn/ggml-qnn-ops.h b/ggml/src/ggml-qnn/ggml-qnn-ops.h deleted file mode 100644 index b1c388a32a87a..0000000000000 --- a/ggml/src/ggml-qnn/ggml-qnn-ops.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (c) 2023-2024 The ggml authors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons 
to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#pragma once - -#include "ggml-qnn-impl.h" -void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); - -void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); -void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 35b565c7d7669..083f3ec466528 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1,17 +1,18 @@ /* - * Copyright (c) 2023-2024 The ggml authors + * Copyright (c) 2024- KanTV authors * * Qualcomm QNN SDK and reference tech guides could be found at: * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools * - * the implementation of ggml-qnn backend has six sections: + * this single-source-file or self-contained implementation of ggml-qnn backend has seven sections: * section-1 does forward/external declaration, * section-2 defines ggml-qnn 
internal log function * section-3 does general helper macro / data structure / function * section-4 does QNN helper macro / data structure / function * section-5 does ggml-qnn backend helper macro / data structure / function / class * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem + * section-7 does implementation of offload ggml op to QNN backend * * currently provide following ggml ops' QNN backend implementation in ggml-qnn-ops.cpp: * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise @@ -36,19 +37,144 @@ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS * IN THE SOFTWARE. */ -#include "ggml-qnn-impl.h" -#include "ggml-qnn-ops.h" +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__ANDROID__) || defined(__linux__) +#include +#include +#include +#include +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if (defined __ANDROID__) || (defined ANDROID) +#include "android/log.h" +#endif + +#if !defined(__ANDROID__) && !defined(__linux__) +#include +#include +#include +#endif + +#include "QnnTypes.h" +#include "QnnCommon.h" +#include "QnnContext.h" +#include "QnnBackend.h" +#include "QnnGraph.h" +#include "QnnProperty.h" +#include "QnnTensor.h" +#include "QnnInterface.h" +#include "Saver/QnnSaver.h" +#include "System/QnnSystemInterface.h" +#include "HTP/QnnHtpDevice.h" +#include "HTP/QnnHtpGraph.h" + +#include "ggml-qnn.h" +#include "ggml-impl.h" +#include "ggml-backend-impl.h" + // ================================================================================================= // section-1: forward/external declaration // ================================================================================================= -static int free_qnn_tensor(Qnn_Tensor_t * tensor); -static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); +class qnn_instance; +struct ggml_backend_qnn_context; typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); +static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false); + +static void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); + +static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); 
+static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); +static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); + // ================================================================================================= -// section-2: ggml-qnn internal troubleshooting function +// section-2: ggml-qnn internal troubleshooting function/class // ================================================================================================= -void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { +#if 0//def NDEBUG +#define GGMLQNN_DEBUG 0 +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 0 +#else +#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info +#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 +#endif +#define GGML_QNN_LOGBUF_LEN 4096 + +#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if GGMLQNN_DEBUG +#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLQNN_LOG_DEBUG(...) +#endif +static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ static std::mutex ggmlqnn_log_internal_mutex; static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; @@ -78,13 +204,72 @@ void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * } } +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() { + _begin_time = ggml_time_us(); + } + + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + } + +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; +#else +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) { + GGML_UNUSED(perf_name); + } + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; + + void start() {} + void info() {} +}; +#endif + // ================================================================================================= // section-3: general helper macro / data structure / function // ================================================================================================= +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete + +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete + +#define GQCGT ggmlqnn_create_general_tensor + +//#if defined(_WIN32) #if !defined(__ANDROID__) && !defined(__linux__) +#define RTLD_GLOBAL 0x100 +#define RTLD_LOCAL 0x000 +#define RTLD_LAZY 0x000 +#define RTLD_NOW 0x001 +static void * dlopen(const char * filename, int flag); +static int dlclose(void * handle); +static void * dlsym(void* handle, const char* name); +static const char * dlerror(void); + static const char * last_func = nullptr; static long last_err; -void * dlopen(const char * dll, int flags) { +static void * dlopen(const char * dll, int flags) { HINSTANCE h = LoadLibraryA(dll); GGML_UNUSED(flags); if (h == NULL) { @@ -94,7 +279,7 @@ void * dlopen(const char * dll, int flags) { return h; } -int dlclose(void * h) { +static int dlclose(void * h) { if (!FreeLibrary((HINSTANCE)h)) { last_err = GetLastError(); last_func = "dlclose"; @@ -103,7 +288,7 @@ int dlclose(void * h) { return 0; } -void * dlsym(void * h, const char * name) { +static void * dlsym(void * h, const char * name) { FARPROC p = GetProcAddress((HINSTANCE)h, name); if (!p) { last_err = GetLastError(); @@ -112,7 +297,7 @@ void * dlsym(void * h, const char * name) { return (void*)(intptr_t)p; } -const char * dlerror(void) { +static const char * dlerror(void) { static char str[512]; if (!last_err) return nullptr; @@ -256,6 +441,22 @@ static void * ggmlqnn_host_malloc(size_t buffer_size, size_t page_size) { // ================================================================================================= // section-4: QNN helper macro / data structure / function // ================================================================================================= +#define CHECK_QNN_API(error, result) \ + do { \ + error = (result); \ + if (QNN_SUCCESS != error) { \ + if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ + GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + } else { \ + GGMLQNN_LOG_INFO("QNN API error = 
%d(%s)\n", error, ggmlqnn_get_error_string(error)); \ + } \ + } \ + } while (0) + +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 + #define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) #define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) @@ -498,7 +699,7 @@ static int free_qnn_tensor(Qnn_Tensor_t * tensor) { return err; } -const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { +static const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html switch (qnn_error_code) { case QNN_SUCCESS: @@ -601,8 +802,7 @@ const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { } } -// helper function to create an operation config -Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, +static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, Qnn_Param_t * params, uint32_t num_params, Qnn_Tensor_t * inputs, uint32_t num_inputs, Qnn_Tensor_t * outputs, uint32_t num_outputs) { @@ -619,6 +819,81 @@ Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, // ================================================================================================= // section-5:ggml-qnn backend helper macro / data structure / function / class // ================================================================================================= +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); + +using qnn_res_t = std::tuple>; +using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; + +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, +}; + +enum qcom_chipset_soc_model { + UNKNOWN_SM = 0, + SM7450 = 41, // v69, 7 Gen1 + SM8350 = 30, // v68, 888 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Gen 4 +#if !defined(__ANDROID__) && !defined(__linux__) + SC7280X = 44, + SC8280X = 37, + SC8380XP = 60, +#endif +}; + +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + char soc_desc[GGML_MAX_NAME]; +}; + +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char desc[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; + + std::unique_ptr work_data; + std::vector> tasks; + size_t work_size = 0; + size_t desired_size = 0; + int n_threads = GGML_DEFAULT_N_THREADS; +}; + +struct qnn_op_caps_t { + const char * qnn_op_name = nullptr; + const size_t input_param_count = 0; + const char * qnn_param_name = nullptr; +}; + 
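A short usage sketch of the qnn_perf helper defined above; the wrapper function example_timed_offload is a hypothetical illustration, not part of this patch, while qnn_perf and ggml_qnn_general_node are the names declared in section-1 and section-2:

    static void example_timed_offload(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
        qnn_perf op_perf = qnn_perf("ggml_qnn_example");
        op_perf.start();                  // records ggml_time_us() when ENABLE_QNNBACKEND_PERF is 1
        ggml_qnn_general_node(ctx, op);   // offload the op through the cached or newly built QNN graph
        op_perf.info();                   // logs the elapsed microseconds; both calls are no-ops otherwise
    }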
//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices static struct qcom_socinfo g_qnn_soc_info_table[] = { /* Qualcomm SnapDragon 7 Gen 1 */ @@ -772,7 +1047,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, }; -const qnn_op_caps_t ggmlqnn_k_op_caps[] = { +static const qnn_op_caps_t ggmlqnn_k_op_caps[] = { {}, // GGML_OP_NONE {}, // GGML_OP_DUP { @@ -944,7 +1219,7 @@ static const char * get_ggml_type_name(ggml_type type) { } // ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { +static Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { switch (ggmltype) { case GGML_TYPE_F16: return QNN_DATATYPE_FLOAT_16; @@ -1003,127 +1278,8 @@ static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, c } } -Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, - Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, - uint32_t rank, uint32_t * dims, - void * data, uint32_t data_size, - bool b_transpose) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - char tensor_name[GGML_MAX_NAME] = {}; - - //ensure the tensor name is unique - if (nullptr == name) { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); - } else { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, get_idx()); - } - GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); - inc_idx(); - - uint32_t reverse_dims[GGML_MAX_DIMS] = {}; - uint32_t transpose_dims[GGML_MAX_DIMS] = {}; - uint32_t * tensor_dims = nullptr; - //case 1:use dims info from ggml tensor - if (nullptr != tensor) { - //there are different dimension order between ggml tensor and qnn tensor - for (size_t idx = 0; idx < rank; idx++) { - reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; - } - tensor_dims = reverse_dims; - } - //case 2: use user's specified tensor_dims - if (nullptr != dims) { - tensor_dims = dims; - } - //case 3: transpose for dst tensor - if (b_transpose) { - GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case - - get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); - tensor_dims = transpose_dims; -#if 0 - for (size_t idx = 0; idx < 4; idx++) { - GGMLQNN_LOG_DEBUG("origin dim[%d]=%d\n", idx, reverse_dims[idx]); - } - for (size_t idx = 0; idx < 4; idx++) { - GGMLQNN_LOG_DEBUG("trans dim[%d]=%d\n", idx, transpose_dims[idx]); - } -#endif - } - - Qnn_Tensor_t qnn_tensor = { - .version= QNN_TENSOR_VERSION_1, - {.v1= { - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, - .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = rank, - .dimensions = tensor_dims, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, .dataSize = 0} - } - } - }; - if (nullptr != name) { - QNN_VER_PTR(qnn_tensor)->name = name; - } - Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (nullptr == p_qnn_tensor) { - GGMLQNN_LOG_WARN("calloc failed"); - return nullptr; - } - error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); - if (error != QNN_SUCCESS) { - free(p_qnn_tensor); - GGMLQNN_LOG_WARN("init 
tensor failed"); - return nullptr; - } - QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; - - return p_qnn_tensor; -} - -Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], - (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - - if (0 == tensor->flags) { - qnn_tensor_type = tensor_type; - } else { - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - } - } - - qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); - Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(tensor, nullptr, - qnn_tensor_type, qnn_data_type, - ggml_n_dims(tensor), dimensions, - nullptr, 0); - - bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == QNN_BACKEND_NPU); - if (enable_npu_rpc) { - QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; - } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); - - return p_qnn_tensor; -} -void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op) { +static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op) { const ggml_tensor * src0 = op->src[0]; const ggml_tensor * src1 = op->src[1]; ggml_tensor * dst = op; @@ -1216,11 +1372,11 @@ static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & o output.append(buffer, len); } -size_t ggmlqnn_get_opcaps_size() { +static size_t ggmlqnn_get_opcaps_size() { return std::size(ggmlqnn_k_op_caps); } -size_t ggmlqnn_get_op_index(const ggml_tensor * tensor) { +static size_t ggmlqnn_get_op_index(const ggml_tensor * tensor) { if (tensor->op == GGML_OP_UNARY) { return static_cast(GGML_OP_COUNT) + static_cast(ggml_get_unary_op(tensor)); } @@ -1234,7 +1390,7 @@ static size_t ggmlqnn_get_op_input_param_count(const ggml_tensor * op) { return ggmlqnn_k_op_caps[op_index].input_param_count; } -void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output) { +static void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output) { GGML_ASSERT(op->op != GGML_OP_NONE); output += ggml_op_desc(op); output += get_ggml_type_name(op->type); @@ -1254,118 +1410,445 @@ Fn load_qnn_functionpointers(void * handle, const char * function_name) { return reinterpret_cast(dlsym(handle, function_name)); } -std::mutex qnn_instance::_init_mutex; -std::unordered_map qnn_instance::_loaded_lib_handle; -std::unordered_map qnn_instance::_lib_path_to_backend_id; -std::unordered_map qnn_instance::_loaded_backend; +class qnn_interface { +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } -void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { - GGMLQNN_LOG_WARN("rpc memory not initialized\n"); - return nullptr; - } - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); - if (nullptr == buf) { - GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); - return nullptr; - } +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } - auto aligned_buf = reinterpret_cast(ggmlqnn_align_to(alignment, - reinterpret_cast(buf))); - bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; - if (!status) { - GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); - } - return aligned_buf; -} + friend class qnn_instance; -void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { - if (_rpcmem_usage > (_rpcmem_capacity - 8)) { // reserve 8Mbytes in rpc mempool - GGMLQNN_LOG_WARN("rpc mempool capcaity: %d MB, usage: %d MB", _rpcmem_capacity, _rpcmem_usage); - return nullptr; - } +public: + qnn_interface() = default; - auto aligned_buf = alloc_rpcmem_internal(bytes, alignment); - if (nullptr == aligned_buf) - return nullptr; - _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes)); + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) - size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); - rpcmem_usage_in_bytes += bytes; - _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); - return aligned_buf; -} + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree) -void qnn_instance::free_rpcmem(void * buf) { - size_t rpcbuffer_size = 0; - if (!_rpcmem_initialized) { - GGMLQNN_LOG_WARN("rpc memory not initialized\n"); - } else if (0 == _rpcmem_store_map.count(buf)) { - GGMLQNN_LOG_WARN("no allocated tensor\n"); - } else { - GGMLQNN_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]); - for (std::unordered_map::iterator it = _rpcmem_usage_map.begin(); - it != _rpcmem_usage_map.end(); - it++) { - void * rpcbuffer = it->first; - if (buf == rpcbuffer) { - rpcbuffer_size = it->second; - size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); - rpcmem_usage_in_bytes -= rpcbuffer_size; - _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); - } - } - if (rpcbuffer_size != 0) { - _rpcmem_usage_map.erase(buf); - } else { - GGMLQNN_LOG_WARN("it shouldn't happen, pls check why?"); - } - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); - _rpcmem_store_map.erase(buf); - } -} + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage) -void qnn_instance::free_rpcmem() { - if (_rpcmem_store_map.empty()) { - GGMLQNN_LOG_WARN("no rpcmem allocated\n"); - return; - } + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig) - for (std::unordered_map::iterator it = _rpcmem_store_map.begin(); - it != _qnn_mem_set.end(); - it++) { - void * rpcbuffer = it->second; - GGMLQNN_LOG_DEBUG("free rpc buffer %p", rpcbuffer); - _pfn_rpc_mem_free(rpcbuffer); - } - _rpcmem_store_map.clear(); - _rpcmem_usage_map.clear(); - _rpcmem_usage = 0; -} + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion) -int32_t qnn_instance::rpcmem_to_fd(void * buf) { 
- int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { - GGMLQNN_LOG_WARN("rpc memory not initialized\n"); - } else { - mem_fd = _pfn_rpc_mem_to_fd(buf); - } + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate) - return mem_fd; -} + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree) -int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { - GGMLQNN_LOG_WARN("invalid param\n"); - return 1; - } + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure) - if (!is_rpcmem_initialized()) { - GGMLQNN_LOG_WARN("rpc memory not initialized\n"); - return 2; + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo) + + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo) + + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary) + + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree) + + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) + + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve) + + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree) + + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel) + + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData) + + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree) + + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister) + + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister) + + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability) + + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor) + + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor) + + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate) + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo) + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree) + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; + } + + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } + + uint32_t get_backend_id() const { + return _qnn_interface->backendId; + } + + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); + } + +private: + const QnnInterface_t * _qnn_interface = nullptr; + + const QnnSystemInterface_t * _qnn_sys_interface = nullptr; +}; + +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const 
std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {} + + ~qnn_instance() { + } + + int qnn_init(const QnnSaver_Config_t ** saver_config); + + int qnn_finalize(); + + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } + + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_interface; + } + + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_raw_system_interface; + } + + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + + Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } + + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + + QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + int init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); + + int finalize_qnn_graph(); + + bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } + + int init_htp_perfinfra(); + + int set_rpc_polling(); + + int set_high_performance_mode(); + + std::string & get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; + } + + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; + } + + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + size_t get_rpcmem_usage() { return _rpcmem_usage; } + + int32_t rpcmem_to_fd(void * buf); + + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); + + void unregister_rpcmem(); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle); + + void * alloc_rpcmem(size_t bytes, size_t alignment); + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); + + void free_rpcmem(void * buf); + void free_rpcmem(); + + bool is_rpcmem_allocated(void * buf); + + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; + } + + bool enable_qnn_rpc() { + return _enable_qnn_rpc; + } + + QNNBackend get_device_id() { + return _device_id; + } + +public: + std::map>> _qnn_graph_map; + +private: + int load_system(); + + int unload_system(); + + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + + int unload_backend(); + + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; + } + + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_system_interface = raw_interface; + } + + void * alloc_rpcmem_internal(size_t bytes, size_t alignment); + + void 
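
For orientation, the public methods declared here are driven in a fairly fixed order by the backend code later in this patch. A minimal lifecycle sketch (error handling trimmed, argument values illustrative only):

    qnn_instance * instance = new qnn_instance(qnn_lib_path, backend_lib_name, "");
    if (0 != instance->qnn_init(nullptr)) {
        // the requested QNN backend is not usable on this device, bail out
    }
    instance->init_qnn_graph(graph_name, device, 8 /*vtcm MB*/, 8 /*hvx threads*/);
    // ... create tensors, add nodes and execute via get_qnn_raw_interface() ...
    instance->finalize_qnn_graph();
    instance->qnn_finalize();
    delete instance;
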
probe_device_meminfo(); + +private: + static constexpr const int _required_num_providers = 1; + +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // name of prebuilt QNN model, might be used in the future + BackendIdType _backend_id; + + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; + + ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + + void * _system_lib_handle = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_power_configid = 1; + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + qnn_interface _qnn_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_rpc_buffer_to_handles; + + static std::mutex _init_mutex; + static std::unordered_map _loaded_lib_handle; + static std::unordered_map _lib_path_to_backend_id; + static std::unordered_map _loaded_backend; + + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + std::unordered_map _rpcmem_usage_map; + size_t _rpcmem_usage = 0; // mempool usage in Mbytes + size_t _rpcmem_capacity = 512; // mempool size in Mbytes + + std::string _graph_name; + QNNBackend _device_id; + void * _rpc_lib_handle = nullptr; + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature + + DISABLE_COPY(qnn_instance); + DISABLE_MOVE(qnn_instance); +}; + + +std::mutex qnn_instance::_init_mutex; +std::unordered_map qnn_instance::_loaded_lib_handle; +std::unordered_map qnn_instance::_lib_path_to_backend_id; +std::unordered_map qnn_instance::_loaded_backend; + +void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + return nullptr; + } + + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (nullptr == buf) { + GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; + } + + auto aligned_buf = reinterpret_cast(ggmlqnn_align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); + } + return aligned_buf; +} + +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (_rpcmem_usage > (_rpcmem_capacity - 8)) { // reserve 8Mbytes in rpc mempool + GGMLQNN_LOG_WARN("rpc mempool capcaity: %d MB, usage: %d MB", _rpcmem_capacity, _rpcmem_usage); + return nullptr; + } + + auto aligned_buf = 
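
The over-allocate-then-align trick in alloc_rpcmem_internal above depends on ggmlqnn_align_to rounding an address up to the requested alignment. Assuming the usual power-of-two round-up (the helper itself is defined elsewhere in this file), the arithmetic is equivalent to:

    // hypothetical equivalent of ggmlqnn_align_to() for a power-of-two alignment
    static inline uintptr_t align_up(uintptr_t addr, uintptr_t alignment) {
        return (addr + alignment - 1) & ~(alignment - 1);
    }
    // e.g. align_up(0x1003, 0x100) == 0x1100; the extra `alignment` bytes requested from
    // rpcmem guarantee the rounded-up pointer still lies inside the allocation.
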
alloc_rpcmem_internal(bytes, alignment); + if (nullptr == aligned_buf) + return nullptr; + _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes)); + + size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); + rpcmem_usage_in_bytes += bytes; + _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); + return aligned_buf; +} + +void qnn_instance::free_rpcmem(void * buf) { + size_t rpcbuffer_size = 0; + if (!_rpcmem_initialized) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + GGMLQNN_LOG_WARN("no allocated tensor\n"); + } else { + GGMLQNN_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]); + for (std::unordered_map::iterator it = _rpcmem_usage_map.begin(); + it != _rpcmem_usage_map.end(); + it++) { + void * rpcbuffer = it->first; + if (buf == rpcbuffer) { + rpcbuffer_size = it->second; + size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); + rpcmem_usage_in_bytes -= rpcbuffer_size; + _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); + } + } + if (rpcbuffer_size != 0) { + _rpcmem_usage_map.erase(buf); + } else { + GGMLQNN_LOG_WARN("it shouldn't happen, pls check why?"); + } + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); + } +} + +void qnn_instance::free_rpcmem() { + if (_rpcmem_store_map.empty()) { + GGMLQNN_LOG_WARN("no rpcmem allocated\n"); + return; + } + + for (std::unordered_map::iterator it = _rpcmem_store_map.begin(); + it != _qnn_mem_set.end(); + it++) { + void * rpcbuffer = it->second; + GGMLQNN_LOG_DEBUG("free rpc buffer %p", rpcbuffer); + _pfn_rpc_mem_free(rpcbuffer); + } + _rpcmem_store_map.clear(); + _rpcmem_usage_map.clear(); + _rpcmem_usage = 0; +} + +int32_t qnn_instance::rpcmem_to_fd(void * buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } + + return mem_fd; +} + +int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + GGMLQNN_LOG_WARN("invalid param\n"); + return 1; + } + + if (!is_rpcmem_initialized()) { + GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + return 2; } if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { @@ -2238,7 +2721,7 @@ void qnn_instance::probe_device_meminfo() { GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); } -uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { +static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { GGMLQNN_LOG_WARN("invalid params\n"); return nullptr; @@ -2257,7 +2740,7 @@ uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * return qnn_rpcbuffer; } -void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { //skip sanity check of params if (nullptr != func_name && nullptr != ctx) { GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); @@ -2286,6 +2769,126 @@ static void dump_op_info(const struct ggml_tensor * tensor) { ggmlqnn_print_tensors_info(nullptr, 
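
Worth noting about the rpcmem usage bookkeeping above: _rpcmem_usage is kept in whole megabytes, so every allocation and free converts to bytes, adjusts, then truncates back. A small worked example (illustrative values):

    size_t usage_mb    = 3;                               // current _rpcmem_usage
    size_t usage_bytes = usage_mb * (1 << 20) + 2621440;  // allocate a 2.5 MB buffer
    usage_mb           = usage_bytes / (1 << 20);         // -> 5: the fractional 0.5 MB is truncated
    // allocations well under 1 MB can therefore disappear from the counter entirely
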
nullptr, src0, src1, dst); } +static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {}; + + //ensure the tensor name is unique + if (nullptr == name) { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); + } else { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, get_idx()); + } + GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); + inc_idx(); + + uint32_t reverse_dims[GGML_MAX_DIMS] = {}; + uint32_t transpose_dims[GGML_MAX_DIMS] = {}; + uint32_t * tensor_dims = nullptr; + //case 1:use dims info from ggml tensor + if (nullptr != tensor) { + //there are different dimension order between ggml tensor and qnn tensor + for (size_t idx = 0; idx < rank; idx++) { + reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + } + tensor_dims = reverse_dims; + } + //case 2: use user's specified tensor_dims + if (nullptr != dims) { + tensor_dims = dims; + } + //case 3: transpose for dst tensor + if (b_transpose) { + GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case + + get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); + tensor_dims = transpose_dims; +#if 0 + for (size_t idx = 0; idx < 4; idx++) { + GGMLQNN_LOG_DEBUG("origin dim[%d]=%d\n", idx, reverse_dims[idx]); + } + for (size_t idx = 0; idx < 4; idx++) { + GGMLQNN_LOG_DEBUG("trans dim[%d]=%d\n", idx, transpose_dims[idx]); + } +#endif + } + + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, + .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = rank, + .dimensions = tensor_dims, + .memType = QNN_TENSORMEMTYPE_RAW, + .clientBuf = {.data = nullptr, .dataSize = 0} + } + } + }; + if (nullptr != name) { + QNN_VER_PTR(qnn_tensor)->name = name; + } + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + GGMLQNN_LOG_WARN("calloc failed"); + return nullptr; + } + error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + GGMLQNN_LOG_WARN("init tensor failed"); + return nullptr; + } + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; + + return p_qnn_tensor; +} + +static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + + if (0 == tensor->flags) { + qnn_tensor_type = tensor_type; + } else { + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; + } + } + + qnn_data_type = 
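
Because ggmlqnn_create_general_tensor above reverses the ggml ne[] order when it builds the QNN dimension array, a concrete shape makes the mapping easier to follow (purely illustrative):

    // ggml stores ne[] fastest-varying first, QNN wants the outermost dimension first,
    // so for a 4-D ggml tensor with ne = {64, 32, 4, 2}:
    //   reverse_dims[idx] = ne[rank - 1 - idx]  ->  {2, 4, 32, 64}
    // i.e. the same buffer is declared to QNN as a [2, 4, 32, 64] tensor.
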
ggmlqnn_datatype_from_ggml_datatype(tensor->type); + Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(tensor, nullptr, + qnn_tensor_type, qnn_data_type, + ggml_n_dims(tensor), dimensions, + nullptr, 0); + + bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == QNN_BACKEND_NPU); + if (enable_npu_rpc) { + QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); + + return p_qnn_tensor; +} + // ================================================================================================= // section-6: implementation of ggml-qnn backend // ================================================================================================= @@ -3066,3 +3669,669 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } GGML_BACKEND_DL_IMPL(ggml_backend_qnn_reg) + +// ================================================================================================= +// section-7: offload GGML op to QNN backend +// ================================================================================================= +static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; + } + + return data_size; + */ + return ggml_nbytes(tensor); +} + +static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLQNN_LOG_WARN("invalid params\n"); + return false; + } + + return true; +} + +#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) + +/* + * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input + * tensor and 1 output tensor +*/ +void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + size_t qnn_op_index = ggmlqnn_get_op_index(op); + GGML_ASSERT(qnn_op_index < ggmlqnn_get_opcaps_size()); + const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; + std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); + const char * ggml_op_name = ggml_op_name_string.c_str(); + + qnn_perf op_perf = qnn_perf(ggml_op_name); + op_perf.start(); + + //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; + + std::string 
graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + //retrieve computational resource from cached QNN graph + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensor = std::get<1>(graph_item); + p_tensor0 = tensor[0]; + p_tensor1 = tensor[1]; + p_tensor2 = tensor[2]; + } else { + GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + GGML_ASSERT(instance->get_device_id() == ctx->device); + //create QNN graph + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + graph_handle = instance->get_qnn_graph_handle(); + + //create computational tensor + p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE); + p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); + p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); + + //compose QNN graph + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + Qnn_OpConfig_t op_config = { + QNN_OPCONFIG_VERSION_1, { + ggml_op_name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, + 0, + nullptr, + 2, + tensor_inputs, + 1, + tensor_outputs + } + }; + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); + //finalize QNN graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + + //cache QNN graph + qnn_tensors_t ggml_op_add_tensors; + ggml_op_add_tensors.reserve(3); + ggml_op_add_tensors.push_back(p_tensor0); + ggml_op_add_tensors.push_back(p_tensor1); + ggml_op_add_tensors.push_back(p_tensor2); + auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + } + + if (enable_npu_rpc) { + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } + + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); + GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + } + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + if (enable_npu_rpc) { + //TODO:NPU RPC feature will failed with test-backend-ops + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } + } + +#if GGMLQNN_PRINT_OP_ADD_LOG + op_perf.info(); +#endif +} + +/* + * this 
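
The caching idiom used in ggml_qnn_general_node above (and again in both mul_mat paths below) keys a tuple of graph handle plus tensor pointers by a name derived from the op, so a QNN graph is composed and finalized once per distinct op/shape combination and merely re-executed afterwards. A stripped-down sketch of the pattern, with qnn_res_t/qnn_tensors_t assumed from earlier in this file:

    auto it = instance->_qnn_graph_map.find(graph_name);
    if (it != instance->_qnn_graph_map.end()) {
        graph_handle = std::get<0>(it->second);      // reuse the finalized graph
        p_tensor0    = std::get<1>(it->second)[0];   // and its cached tensors
    } else {
        // create tensors, add nodes, graphFinalize(), then cache for the next call:
        instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, tensors);
    }
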
function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend + * UT in ggml-qnn-ut.cpp passed: + * ./scripts/build-run-android.sh run_ut_mulmat 0 + * ./scripts/build-run-android.sh run_ut_mulmat 1 + * ./scripts/build-run-android.sh run_ut_mulmat 2 + * + * the logic of ggml_qnn_mul_mat_4d is similar to ggml_qnn_mul_mat but much more complicated + * than ggml_qnn_mul_mat, so it's a standalone function. + * it will be combined with ggml_qnn_mul_mat in the future + */ +static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); + qnn_instance *instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + + const ggml_tensor *src0 = op->src[0]; + const ggml_tensor *src1 = op->src[1]; + ggml_tensor *dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); + op_perf.start(); + + std::string graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); + + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t *p_tensor0 = nullptr; + Qnn_Tensor_t *p_reshape0_out = nullptr; + Qnn_Tensor_t *p_tile0_out = nullptr; + Qnn_Tensor_t *p_tensor1 = nullptr; + Qnn_Tensor_t *p_permute1_out = nullptr; + Qnn_Tensor_t *p_reshape1_out = nullptr; + Qnn_Tensor_t *p_matmul_out = nullptr; + Qnn_Tensor_t *p_reshape2_out = nullptr; + + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + graph_initialized = true; + qnn_res_t &graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t &tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_reshape0_out = tensors[1]; + p_tile0_out = tensors[2]; + p_tensor1 = tensors[3]; + p_permute1_out = tensors[4]; + p_reshape1_out = tensors[5]; + p_matmul_out = tensors[6]; + p_reshape2_out = tensors[7]; + } else { + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), + graph_name.c_str(), NULL, &graph_handle)); + + // Define dimensions + uint32_t K = src0->ne[0]; // Inner dimension + uint32_t M = src0->ne[1]; // Rows of src0 + uint32_t N = src1->ne[1]; // Columns of src1 + uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch + uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) + + // Validate K only + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match + + // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), static_cast(src0->ne[0])}; + p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src0_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + + // Reshape src0 to [B0, M, K] + uint32_t reshape0_out_dims[] = {B0, M, K}; + p_reshape0_out = GQCGT(nullptr, "reshape0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape0_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape0_out)); + Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; + Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; + Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", 
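
To make the reshape/tile plan above concrete, one illustrative shape assignment (values chosen only for this example):

    // src0: ne = {K=64, M=16, 2, 1}  ->  B0 = 2*1 = 2, reshaped to [B0, M, K] = [2, 16, 64]
    // src1: ne = {K=64, N=8,  4, 2}  ->  B1 = 4*2 = 8, ends up as   [B1, K, N] = [8, 64, 8]
    // tile multiples = {B1/B0, 1, 1} = {4, 1, 1}: src0 is repeated 4x along the batch axis
    // MatMul output: [B1, M, N] = [8, 16, 8], reshaped back to dst's 4-D layout at the end
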
QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape0_inputs, 1, reshape0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); + + // Tile src0 to match B1: [B0, M, K] -> [B1, M, K] + uint32_t tile0_out_dims[] = {B1, M, K}; + p_tile0_out = GQCGT(nullptr, "tile0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + tile0_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile0_out)); + uint32_t tile_multiples[] = {B1 / B0, 1, 1}; + uint32_t tile_dims[] = {3}; + Qnn_Tensor_t *p_tile_multiples = GQCGT(nullptr, "tile_multiples", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + tile_dims, tile_multiples, sizeof(tile_multiples)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile_multiples)); + Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; + Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; + Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; + Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TILE, tile_params, 1, + tile0_inputs, 1, tile0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); + + // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; + p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src1_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + + // Permute src1 to [B1, H1, K, N] + uint32_t perm_data[] = {0, 1, 3, 2}; + uint32_t perm_dims[] = {4}; + Qnn_Tensor_t *p_perm = GQCGT(nullptr, "perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_perm)); + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; + p_permute1_out = GQCGT(nullptr, "permute1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + permute1_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_permute1_out)); + Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; + Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; + Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; + Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, permute1_params, 1, + permute1_inputs, 1, permute1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); + + // Reshape src1 to [B1, K, N] + uint32_t reshape1_out_dims[] = {B1, K, N}; + p_reshape1_out = GQCGT(nullptr, "reshape1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape1_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape1_out)); + Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; + Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; + Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape1_inputs, 1, reshape1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); + + // MatMul: [B1, M, K] x 
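
The TRANSPOSE node above is driven by a static perm tensor; with the usual convention that output axis i is taken from input axis perm[i], the chosen permutation only swaps the two innermost axes:

    // perm = {0, 1, 3, 2} applied to src1 in QNN order [B1, H1, N, K]:
    //   out = [in[0], in[1], in[3], in[2]] = [B1, H1, K, N]
    // which is exactly the right-hand operand layout the following MatMul expects.
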
[B1, K, N] -> [B1, M, N] + uint32_t matmul_out_dims[] = {B1, M, N}; + p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + matmul_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); + Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; + Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; + Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, nullptr, 0, + matmul_inputs, 2, matmul_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); + + // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] + uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; + p_reshape2_out = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, + reshape2_out_dims, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape2_out)); + Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; + Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; + Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape2_inputs, 1, reshape2_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); + + // Finalize + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); + + // Cache + qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out}; + instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + } + + // Execute + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; + + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, + output_tensors, 1, NULL, NULL)); + +#if 0 + // Log dst for debugging + float *dst_data = (float *)dst->data; + GGMLQNN_LOG_DEBUG("dst shape: [%d, %d, %d, %d]\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); + for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) { + GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]); + } +#endif + + op_perf.info(); +} + +/* + * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs + * using the QNN backend. this function performs matrix multiplication of the input tensor + * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, + * and stores the result in the destination tensor `dst`. + * + there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor + + 2. 
QNN's MatMul can only support input tensors with rank >= 2 + + in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose + operation when offloading mulmat to QNN backend. this implementation will handle transpose + in func ggml_qnn_create_general_tensor() + * + * this function is a good example to illustrated the second technical approach "mapping the + * entire ggml computational graph to QNN graph" without complex C++ encapsulation. or another + * pipeline of "how to utilize the Hexagon NPU maximally through QNN SDK", details could be found at + * https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 + * + * @param ctx the context of ggml-qnn backend + * @param op the destination tensor where the result of the matrix multiplication will be stored. + * + * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated + * than ggml_qnn_general_node. so it's a standalone function. accordingly, this is another + * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute + * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds + * of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend + * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) + * and src1 is F32, src0 -> f32 in src0', then src0' * src1 +*/ +void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Tensor_t * p_param_tensor = nullptr; + Qnn_Tensor_t * p_tensor2_transpose = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + op_perf.start(); + + const enum ggml_type src0_type = src0->type; + const uint32_t src0_rank = ggml_n_dims(src0); + const uint32_t src1_rank = ggml_n_dims(src1); + GGML_ASSERT(src0_rank == src1_rank); + GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy + if (4 == src0_rank) { + return ggml_qnn_mul_mat_4d(ctx, op); + } + void * wdata = ggmlqnn_type_trait(ctx, op); + const size_t desired_size = ctx->desired_size; + + ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + + std::string graph_name; + ggmlqnn_get_graphkey_from_op(op, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + //retrieve computational resource from cached QNN graph + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_tensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_tensor1 = tensors[1]; + p_tensor2 = tensors[2]; + p_param_tensor = tensors[3]; + p_tensor2_transpose = tensors[4]; + } else { + //create QNN graph + GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + if (QNN_SUCCESS != error) { + GGMLQNN_LOG_WARN("can't create 
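
Tying the 3x2 example from the comment above to what the code below actually builds (illustrative only):

    // ggml: ne  = {2, 3}  -> 3 rows x 2 columns, data laid out row by row: 0 1 2 3 4 5
    // QNN : dims = {3, 2}  -> the same buffer, declared outermost dimension first
    // for a rank-2 mulmat the param tensor below selects perm {1, 0}, i.e. the extra
    // TRANSPOSE node swaps the two axes of the MatMul result back into ggml's ordering.
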
qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + + //create computational tensor + p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + + //create param tensor for offload 2d/3d/4d matrix multiplication + const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, + }; + uint32_t param_tensor_dims[1] = {src0_rank}; + p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); + + //create transpose tensor + p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); + + //compose QNN graph: add mulmat node + Qnn_Param_t out_0_params[] = {{QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; + Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); + + //compose QNN graph: add transpose node + Qnn_Param_t out_trans1_0_params[] = { {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; + Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); + + //finalize QNN graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + + //cache QNN graph + qnn_tensors_t ggml_op_mulmat_tensors; + ggml_op_mulmat_tensors.reserve(5); + ggml_op_mulmat_tensors.push_back(p_tensor0); + ggml_op_mulmat_tensors.push_back(p_tensor1); + ggml_op_mulmat_tensors.push_back(p_tensor2); + ggml_op_mulmat_tensors.push_back(p_param_tensor); + ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); + auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + } + + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + 
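
Putting the two nodes composed above together, the cached graph for the rank-2 case looks roughly like this (a sketch of the dataflow, not additional code):

    // p_tensor0 [M, K] --\
    //                     MAT_MUL (transpose_in1 = 1) --> p_tensor2_transpose --TRANSPOSE {1,0}--> p_tensor2 [N, M]
    // p_tensor1 [N, K] --/
    // transpose_in1 lets QNN consume src1 in its ggml-derived layout without materializing
    // src1^T, and the trailing TRANSPOSE restores the output ordering ggml expects.
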
QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + op_perf.info(); +} + +void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); + GGML_UNUSED(value); +} + +void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + ggml_qnn_dup(ctx, dst); +} + +void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} + +void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 5e69024298dbe..5b5e55aa2f7b6 100755 --- a/scripts/build-run-android.sh +++ 
b/scripts/build-run-android.sh @@ -147,6 +147,7 @@ function prepare_run_on_phone() adb shell chmod +x ${REMOTE_PATH}/${program} } + function run_llamacli() { prepare_run_on_phone llama-cli @@ -212,35 +213,6 @@ function run_test-op() } -function run_ut_add() -{ - prepare_run_on_phone ggml-qnn-ut - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_ADD -b $qnnbackend" - -} - -function run_ut_mulmat() -{ - prepare_run_on_phone ggml-qnn-ut - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_MUL_MAT -b $qnnbackend" - -} - -function run_ut_mul() -{ - prepare_run_on_phone ggml-qnn-ut - - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/ggml-qnn-ut -t GGML_OP_MUL -b $qnnbackend" - -} function print_oplist() { @@ -330,10 +302,7 @@ function show_usage() echo " $0 build" echo " $0 updateqnnlib" echo " $0 run_testops" - echo " $0 run_testop [ADD/MUL/MUL_MAT] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]" - echo " $0 run_ut_add 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_ut_mulmat 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_ut_mul 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_testop [ADD/MUL/MUL_MAT/...(op from print_oplist)] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]" echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" @@ -374,31 +343,20 @@ elif [ $# == 1 ]; then fi elif [ $# == 2 ]; then qnnbackend=$2 - if [ ${qnnbackend} -gt 3 ]; then - show_usage - exit 1 - fi - if [ "$1" == "run_llamacli" ]; then run_llamacli exit 0 elif [ "$1" == "run_llamabench" ]; then run_llamabench exit 0 - elif [ "$1" == "run_ut_add" ]; then - run_ut_add - exit 0 - elif [ "$1" == "run_ut_mulmat" ]; then - run_ut_mulmat - exit 0 - elif [ "$1" == "run_ut_mul" ]; then - run_ut_mul exit 0 + else + show_usage + exit 1 fi elif [ $# == 3 ]; then + #opname can be found via print_oplist: opname=$2 -#TODO: check opname in oplist -#opname can be found via print_oplist: qnnbackend=$3 if [ ${qnnbackend} -gt 3 ]; then diff --git a/scripts/build-run-windows.sh b/scripts/build-run-windows.sh deleted file mode 100755 index c9a5b13d71d4c..0000000000000 --- a/scripts/build-run-windows.sh +++ /dev/null @@ -1,222 +0,0 @@ -#!/bin/bash -# build llama.cpp or llama.cpp + ggml-qnn for Windows with cygwin on Windows -# build llama.cpp + ggml-qnn for Snapdragon desktop SoC equipped WoA(Windows on ARM) with cygwin on Windows - -# items marked TODO has not verified yet - -set -e - - -PWD=`pwd` -PREFIX_PATH=/cygdrive/c -GGUF_MODEL_NAME=${PREFIX_PATH}/qwen1_5-1_8b-chat-q4_0.gguf -PROJECT_HOME_PATH=`pwd` - -#QNN SDK could be found at: -#https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk -#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools -QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk -QNN_SDK_PATH=${PREFIX_PATH}/qairt/2.31.0.250130/ - -#default is QNN NPU -qnnbackend=2 - -function dump_vars() -{ - echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" -} - - -function show_pwd() -{ - echo -e "current working path:$(pwd)\n" -} - - -function check_qnn_sdk() -{ - if [ ! 
-d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from ${QNN_SDK_URL}...\n" - exit 1 - fi -} - -function build_windows_x86 -{ - echo "build_windows_x86-without-qnn" - cmake -H. -B./out/windows_x86 -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF - cd out/windows_x86 - make -j16 - show_pwd - - cd - -} - -function build_windows_x86_qnn -{ - echo "build_windows_x86-with-qnn" - cmake -H. -B./out/windows_x86_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} - cd out/windows_x86_qnn - make -j16 - show_pwd - - cd - -} - -#TODO -function build_windows_arm64_qnn -{ - echo "build_windows_arm64 not supported now" - echo "cmake source dir:${PROJECT_HOME_PATH}" - cmake -H. -B./out/windows_arm64_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DGGML_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${PROJECT_HOME_PATH}/cmake/arm64-windows-llvm.cmake -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} - #cmake -H. -B./out/windows_arm64_qnn -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DGGML_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${PROJECT_HOME_PATH}/cmake/arm64-windows-cygwin.cmake -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} - cd out/windows_arm64_qnn - make -j16 - show_pwd - - cd - -} - - -function remove_temp_dir() -{ - if [ -d out/windows_x86 ]; then - echo "remove out/windows_x86 directory in `pwd`" - rm -rf out/windows_x86 - fi -} - - -function check_qnn_libs() -{ - echo "do nothing" -} - - -function update_qnn_libs() -{ - echo "do nothing" -} - -function build_x86() -{ - show_pwd - check_qnn_sdk - dump_vars - #some unexpected behaviour on Windows - #remove_temp_dir - build_windows_x86 -} - -function build_x86_qnn() -{ - show_pwd - check_qnn_sdk - dump_vars - #some unexpected behaviour on Windows - #remove_temp_dir - build_windows_x86_qnn -} - -function build_arm64_qnn() -{ - show_pwd - check_qnn_sdk - dump_vars - #some unexpected behaviour on Windows - #remove_temp_dir - build_windows_arm64_qnn -} - -function run_llamacli() -{ - check_qnn_libs - echo "not supported on Windows now" - - #llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\" - -} - - -function run_llamabench() -{ - check_qnn_libs - echo "not supported on Windows now" - - #llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}" - -} - - -function run_test-backend-ops() -{ - check_qnn_libs - echo "not supported on Windows now" - - #test-backend-ops test" - -} - - -function show_usage() -{ - echo "Usage:" - echo " $0 build_x86" - echo " $0 build_x86_qnn" - echo " $0 build_arm64_qnn" - echo " $0 run_testop" - echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo -e "\n\n\n" -} - - -show_pwd - -check_qnn_sdk - -if [ $# == 0 ]; then - show_usage - exit 1 -elif [ $# == 1 ]; then - if [ "$1" == "-h" ]; then - show_usage - exit 1 - elif [ "$1" == "help" ]; then - show_usage - exit 1 - elif [ "$1" == "build_x86" ]; then - build_x86 - exit 0 - elif [ "$1" == "build_x86_qnn" ]; then - build_x86_qnn - exit 0 - elif [ "$1" == "build_arm64_qnn" ]; then - build_arm64_qnn - exit 0 - - elif [ "$1" == "run_testop" ]; then - run_test-backend-ops - exit 0 - else - show_usage - exit 1 - fi -elif [ $# == 2 ]; then - qnnbackend=$2 - if [ ${qnnbackend} -gt 3 ]; then - show_usage - exit 1 - fi - - if [ "$1" == "run_llamacli" ]; then - run_llamacli - exit 0 - elif [ "$1" == 
"run_llamabench" ]; then - run_llamabench - exit 0 - fi -else - show_usage - exit 1 -fi diff --git a/src/llama-mmap.cpp b/src/llama-mmap.cpp index 7345eee2ea989..9da97f1bc5057 100644 --- a/src/llama-mmap.cpp +++ b/src/llama-mmap.cpp @@ -481,10 +481,10 @@ struct llama_mlock::impl { // Skip resource limit checks on visionOS/tvOS suggest = false; #else - struct rlimit lock_limit = {}; - //if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { - // suggest = false; - //} + struct rlimit lock_limit; + if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit)) { + suggest = false; + } if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size)) { suggest = false; } From 6ce5202f30435936ba90b3c2581323e28d3ecddb Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 11 Mar 2025 12:52:54 +0800 Subject: [PATCH 124/200] ggml-qnn: pr to upstream --- CMakeLists.txt | 1 - cmake/aarch64-w64-mingw32.cmake | 18 - cmake/arm64-windows-cygwin.cmake | 16 - cmake/arm64-windows-llvm.cmake | 4 +- ggml/src/ggml-qnn/CMakeLists.txt | 3 +- ggml/src/ggml-qnn/ggml-qnn.cpp | 2606 +++++++++++++++++------------- scripts/build-run-android.sh | 70 +- scripts/ggml-qnn.cfg | 9 + tests/ggml-qnn-ut.cpp | 480 ------ 9 files changed, 1583 insertions(+), 1624 deletions(-) delete mode 100644 cmake/aarch64-w64-mingw32.cmake delete mode 100644 cmake/arm64-windows-cygwin.cmake create mode 100644 scripts/ggml-qnn.cfg delete mode 100644 tests/ggml-qnn-ut.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index f124bc2957472..de51c0a17b2f6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,6 @@ include(CheckIncludeFileCXX) set(CMAKE_WARN_UNUSED_CLI YES) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(CMAKE_VERBOSE_MAKEFILE on) if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) diff --git a/cmake/aarch64-w64-mingw32.cmake b/cmake/aarch64-w64-mingw32.cmake deleted file mode 100644 index 775fa46337628..0000000000000 --- a/cmake/aarch64-w64-mingw32.cmake +++ /dev/null @@ -1,18 +0,0 @@ -#TODO -#not work on Linux -set( CMAKE_SYSTEM_NAME mingw ) -set( CMAKE_SYSTEM_PROCESSOR arm64 ) - -set( target aarch64-w64-mingw32 ) - -set( CMAKE_C_COMPILER aarch64-w64-mingw32-gcc ) -set( CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++ ) - -set( CMAKE_C_COMPILER_TARGET ${target} ) -set( CMAKE_CXX_COMPILER_TARGET ${target} ) - -#set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) -#set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) - -set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) -set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/cmake/arm64-windows-cygwin.cmake b/cmake/arm64-windows-cygwin.cmake deleted file mode 100644 index c7a313bb77adf..0000000000000 --- a/cmake/arm64-windows-cygwin.cmake +++ /dev/null @@ -1,16 +0,0 @@ -set( CMAKE_SYSTEM_NAME CYGWIN) -set( CMAKE_SYSTEM_PROCESSOR arm64 ) - -set( target aarch64-w64-cygwin) - -set( CMAKE_C_COMPILER clang ) -set( CMAKE_CXX_COMPILER clang++ ) - -set( CMAKE_C_COMPILER_TARGET ${target} ) -set( CMAKE_CXX_COMPILER_TARGET ${target} ) - -set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) -set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) - -set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) -set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/cmake/arm64-windows-llvm.cmake 
b/cmake/arm64-windows-llvm.cmake index 983206032df3d..8023796800683 100644 --- a/cmake/arm64-windows-llvm.cmake +++ b/cmake/arm64-windows-llvm.cmake @@ -9,8 +9,8 @@ set( CMAKE_CXX_COMPILER clang++ ) set( CMAKE_C_COMPILER_TARGET ${target} ) set( CMAKE_CXX_COMPILER_TARGET ${target} ) -#set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) -#set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) +set( arch_c_flags "-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" ) +set( warn_c_flags "-Wno-format -Wno-unused-variable -Wno-unused-function -Wno-gnu-zero-variadic-macro-arguments" ) set( CMAKE_C_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags} ${warn_c_flags}" ) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index c11e2f82fa92b..fcbbc33a9b136 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -1,4 +1,5 @@ message(STATUS "Using QNN backend") +message("CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}") if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) @@ -6,8 +7,6 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") -elseif(CMAKE_SYSTEM_NAME STREQUAL "CYGWIN") - set(QNN_DEFAULT_LIB_SEARCH_PATH "/cygdrive/c/qairt/2.31.0.250130/" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android and Windows(Windows on ARM)") endif() diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 083f3ec466528..7c3477094ea9f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1,20 +1,23 @@ /* - * Copyright (c) 2024- KanTV authors + * Copyright (c) 2023-2024 The ggml authors * * Qualcomm QNN SDK and reference tech guides could be found at: * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools * - * this single-source-file or self-contained implementation of ggml-qnn backend has seven sections: - * section-1 does forward/external declaration, - * section-2 defines ggml-qnn internal log function - * section-3 does general helper macro / data structure / function - * section-4 does QNN helper macro / data structure / function - * section-5 does ggml-qnn backend helper macro / data structure / function / class - * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem - * section-7 does implementation of offload ggml op to QNN backend + * this single-source-file or self-contained implementation of ggml-qnn backend has 10 sections: + * section-1 forward/prototype declaration + * section-2 global vars, macros, data structures + * section-3 ggml-qnn internal troubleshooting function/class + * section-4 helper function for WoA(Windows on ARM) + * section-5 general helper function + * section-6 QNN helper function + * section-7 ggml-qnn backend helper function / class + * section-8 implementation of ggml-qnn backend according to ggml's backend subsystem + * section-9 implementation of offload ggml op to QNN backend + * section-10 illustrate why the second approach is actual an fake at the moment * - * 
currently provide following ggml ops' QNN backend implementation in ggml-qnn-ops.cpp: + * currently provide following ggml op' QNN backend implementation: * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise * - GGML_OP_MUL_MAT:this is a complicated skeleton, can expand other complex ggml ops accordingly @@ -52,6 +55,7 @@ #include #include #include +#include #endif #include @@ -65,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -72,10 +77,8 @@ #include #include #include -#include #include #include -#include #include #if (defined __ANDROID__) || (defined ANDROID) #include "android/log.h" @@ -105,22 +108,26 @@ #include "ggml-backend-impl.h" // ================================================================================================= -// section-1: forward/external declaration +// section-1: forward/prototype declaration // ================================================================================================= class qnn_instance; struct ggml_backend_qnn_context; typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); - static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size, bool b_transpose = false); +static enum ggml_status ggmlqnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); +static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); +static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +//op functions: +//done static void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); - +//todo static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); @@ -134,1032 +141,1215 @@ static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); -static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void 
ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); // ================================================================================================= -// section-2: ggml-qnn internal troubleshooting function/class +// section-2: global var, macro, data structure // ================================================================================================= +// the following two vars can be fetched from [qnn_runtimelib_path]/ggml-qnn.cfg +// [general] +// print_qnn_internal_log=0 +// inference_approach=0 +static int g_print_qnn_internal_log = 0; // enable/disable QNN's internal log +static int g_inference_approach = 0; // 0: general approach,similar to ggml-sycl or ggml-cann 1: mapping entire ggml cgraph to QNN graph +static const char * g_qnn_cfgfilename = "ggml-qnn.cfg"; + +#if defined(__ANDROID__) +//Android command line program +static const char * g_qnn_runtimelib_path = "/data/local/tmp/"; +#elif defined(__linux__) +static const char * g_qnn_runtimelib_path = "/tmp/"; +#elif defined(_WIN32) +static const char * g_qnn_runtimelib_path = "C:\\"; +#endif + +#if !defined(__ANDROID__) && !defined(__linux__) +static std::atomic g_ggmltensor_idx(0); //ensure every QNN tensor name is unique +#else +static int32_t g_ggmltensor_idx = 0; //ensure every QNN tensor name is unique +#endif + #if 0//def NDEBUG -#define GGMLQNN_DEBUG 0 -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 0 +#define GGMLQNN_DEBUG 0 +#define ENABLE_QNNBACKEND_PERF 0 +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 0 #else -#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNNBACKEND_PERF 0 // enable/disable op's perf info -#define GGMLQNN_PRINT_QNN_INTERNAL_LOG 0 // enable/disable QNN's internal log -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 +#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend +#define ENABLE_QNNBACKEND_PERF 0 +#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU +#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 #endif -#define GGML_QNN_LOGBUF_LEN 4096 +#define GGML_QNN_LOGBUF_LEN 4096 -#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_ERROR(...) 
ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #if GGMLQNN_DEBUG -#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else #define GGMLQNN_LOG_DEBUG(...) #endif -static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { - static std::mutex ggmlqnn_log_internal_mutex; - static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; - GGML_UNUSED(file); -#if !(defined __ANDROID__) || !(defined ANDROID) - GGML_UNUSED(level); -#endif - { - std::lock_guard lock(ggmlqnn_log_internal_mutex); - va_list args; - va_start(args, format); - int len_prefix = snprintf(s_ggmlqnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggmlqnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { -#if (defined __ANDROID__) || (defined ANDROID) - //for Android application(standard APP or command line tool) - __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); - if (GGML_LOG_LEVEL_INFO == level) { - printf("%s\n", s_ggmlqnn_log_internal_buf); - } -#else - //for Snapdragon based WoA(Windows on ARM) device or Linux - printf("%s\n", s_ggmlqnn_log_internal_buf); -#endif - } - va_end(args); - } -} +#define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) +#define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) +#define GQCGT ggmlqnn_create_general_tensor +#define QNN_VER_PTR(x) (&((x).v1)) +#define RPCMEM_DEFAULT_FLAGS 1 +#define RPCMEM_HEAP_ID_SYSTEM 25 -#if ENABLE_QNNBACKEND_PERF -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; +#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) +#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) +#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) +#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) +#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) +#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) +#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) +#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) +#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) +#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) +#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) - void start() { - _begin_time = ggml_time_us(); - } +#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) +#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) +#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) +#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) +#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) 
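// A minimal sketch (not part of this patch) of how the two checking macros above are meant to be
// used at the top of an op function; the function name, the dummy body and the commented-out QNN
// call are illustrative assumptions only:
static void ggml_qnn_example_op(ggml_backend_qnn_context * ctx, ggml_tensor * dst) {
    ggml_tensor * src0 = dst->src[0];
    ggml_tensor * src1 = dst->src[1];
    // bail out early when (ctx, src0, src1, dst) does not form a valid offload request
    GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst);
    Qnn_ErrorHandle_t error = QNN_SUCCESS;
    // every raw QNN API call is then wrapped so failures are logged with a readable string, e.g.
    //     CHECK_QNN_API(error, raw_interface.graphFinalize(graph_handle, nullptr, nullptr));
    GGML_UNUSED(error);
}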
set_qnn_tensor_datatype(tensor, value) +#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) +#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) +#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) +#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) +#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) +#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - void info() { - _end_time = ggml_time_us(); - _duration = (_end_time - _begin_time); - GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); - } +#define DISABLE_COPY(class_name) \ + class_name(const class_name &) = delete; \ + void operator=(const class_name &) = delete -private: - int64_t _begin_time = 0LL; - int64_t _end_time = 0LL; - int64_t _duration = 0LL; - std::string _perf_name; -}; -#else -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) { - GGML_UNUSED(perf_name); - } - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; +#define DISABLE_MOVE(class_name) \ + class_name(class_name &&) = delete; \ + void operator=(class_name &&) = delete - void start() {} - void info() {} -}; -#endif +#define CHECK_QNN_API(error, result) \ + do { \ + error = (result); \ + if (QNN_SUCCESS != error) { \ + if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ + GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + } else { \ + GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_qnnerror_string(error)); \ + } \ + } \ + } while (0) -// ================================================================================================= -// section-3: general helper macro / data structure / function -// ================================================================================================= -#define DISABLE_COPY(class_name) \ - class_name(const class_name &) = delete; \ - void operator=(const class_name &) = delete +#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ + do { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ + } while (0) -#define DISABLE_MOVE(class_name) \ - class_name(class_name &&) = delete; \ - void operator=(class_name &&) = delete +using pfn_rpc_mem_init = void (*)(void); +using pfn_rpc_mem_deinit = void (*)(void); +using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); +using pfn_rpc_mem_free = void (*)(void *); +using pfn_rpc_mem_to_fd = int (*)(void *); +using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); +using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); +using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); +using qnn_res_t = std::tuple>; +using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; + +enum class ggml_qnn_profile_level { + profile_off = 0, + profile_basic = 1, + profile_detail = 2 +}; -#define GQCGT ggmlqnn_create_general_tensor +enum qcom_htp_arch { + NONE = 0, + V68 = 68, + V69 = 69, + V73 = 73, + V75 = 75, + V79 = 79, +}; -//#if defined(_WIN32) +enum qcom_chipset_soc_model { + UNKNOWN_SM = 0, + SM7450 = 41, // v69, 7 Gen1 + SM8350 = 30, // v68, 888 + SM8450 = 36, // v69, SD 8 Gen 1 + SM8475 = 42, // v69, SD 8+ Gen 1 + SM8550 = 43, // v73, SD 8 Gen 2 + SM8650 = 57, // v75, SD 8 Gen 3 + SM8750 = 69, // v79, SD 8 Gen 4 #if !defined(__ANDROID__) && 
!defined(__linux__) -#define RTLD_GLOBAL 0x100 -#define RTLD_LOCAL 0x000 -#define RTLD_LAZY 0x000 -#define RTLD_NOW 0x001 -static void * dlopen(const char * filename, int flag); -static int dlclose(void * handle); -static void * dlsym(void* handle, const char* name); -static const char * dlerror(void); + SC7280X = 44, + SC8280X = 37, + SC8380XP = 60, +#endif +}; -static const char * last_func = nullptr; -static long last_err; -static void * dlopen(const char * dll, int flags) { - HINSTANCE h = LoadLibraryA(dll); - GGML_UNUSED(flags); - if (h == NULL) { - last_err = GetLastError(); - last_func = "dlopen"; - } - return h; -} +struct qcom_socinfo { + uint32_t soc_model; + size_t htp_arch; + size_t vtcm_size_in_mb; + char soc_desc[GGML_MAX_NAME]; +}; -static int dlclose(void * h) { - if (!FreeLibrary((HINSTANCE)h)) { - last_err = GetLastError(); - last_func = "dlclose"; - return -1; - } - return 0; -} +struct ggml_backend_qnn_context { + int device; + int threads; + char name[GGML_MAX_NAME]; + char desc[GGML_MAX_NAME]; + char lib[GGML_MAX_NAME]; + qnn_instance * instance; + struct ggml_backend * backend; + QNN_INTERFACE_VER_TYPE raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; + struct qcom_socinfo socinfo; -static void * dlsym(void * h, const char * name) { - FARPROC p = GetProcAddress((HINSTANCE)h, name); - if (!p) { - last_err = GetLastError(); - last_func = "dlsym"; - } - return (void*)(intptr_t)p; -} + std::unique_ptr work_data; + std::vector> tasks; + size_t work_size = 0; + size_t desired_size = 0; + int n_threads = GGML_DEFAULT_N_THREADS; +}; -static const char * dlerror(void) { - static char str[512]; - if (!last_err) return nullptr; +struct qnn_op_caps_t { + const char * qnn_op_name = nullptr; + const size_t input_param_count = 0; + const char * qnn_param_name = nullptr; +}; - snprintf(str, 512, "%s error #%ld", last_func, last_err); - last_err = 0; - last_func = NULL; +//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices +static struct qcom_socinfo g_qnn_soc_info_table[] = { + /* Qualcomm SnapDragon 7 Gen 1 */ + { + .soc_model = SM7450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7 Gen 1"}, - return str; -} -#endif + /* Qualcomm SnapDragon 888 */ + { + .soc_model = SM8350, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 888 "}, -#define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) -#define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) -static void * ggmlqnn_mallocz_aligned(size_t size, size_t alignment) { - uint8_t * buffer = NULL; - size_t * sp = NULL; - buffer = static_cast(calloc(1, size + GGMLQNN_MEM_ADD(alignment))); - if (!buffer) - return NULL; - sp = (size_t *)buffer; - *sp = size; - buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); - buffer[-1] = buffer - (uint8_t *)sp; - return buffer; -} + /* Qualcomm SnapDragon 8 Gen 1 */ + { + .soc_model = SM8450, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1"}, -static void * ggmlqnn_malloc_aligned(size_t size, size_t alignment) { - uint8_t * buffer = NULL; - size_t * sp = NULL; - buffer = static_cast(malloc(size + GGMLQNN_MEM_ADD(alignment))); - if (!buffer) - return NULL; - sp = (size_t *)buffer; - *sp = size; - buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); - buffer[-1] = buffer - (uint8_t *)sp; - return buffer; -} + /* 
Qualcomm SnapDragon 8 Gen 1+ */ + { + .soc_model = SM8475, + .htp_arch = V69, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 1+"}, -static void ggmqnn_free_aligned(void * ptr) { - uint8_t * old = (uint8_t *)ptr; - if (!old) - return; - old -= old[-1]; - free(old); -} + /* Qualcomm SnapDragon 8 Gen 2 */ + { + .soc_model = SM8550, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 2"}, -static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 ? offset - : offset + - (static_cast(alignment) - - offset % static_cast(alignment)); -} + /* Qualcomm SnapDragon 8 Gen 3 */ + { + .soc_model = SM8650, + .htp_arch = V75, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 3 "}, -static size_t get_system_total_memory_in_bytes() { -#if defined(__ANDROID__) || defined(__linux__) - struct sysinfo info = {}; - if (0 == sysinfo(&info)) { - return (info.totalram + info.totalswap) * info.mem_unit; - } - size_t pages = (size_t)sysconf(_SC_PHYS_PAGES); - size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + /* Qualcomm SnapDragon 8 Gen 4 */ + { + .soc_model = SM8750, + .htp_arch = V79, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, - return pages * page_size; -#else - //FIXME: Snapdragon based WoA(Windows on ARM) - MEMORYSTATUSEX statex; - statex.dwLength = sizeof(statex); - if (GlobalMemoryStatusEx(&statex)) { - GGMLQNN_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); - GGMLQNN_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); - return statex.ullTotalPhys; - } - return 0; -#endif -} +#if !defined(__ANDROID__) && !defined(__linux__) + /* Qualcomm SnapDragon 7c Gen 2 */ + { + .soc_model = SC7280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 7c Gen 2"}, -static size_t get_system_free_memory_in_bytes() { -#if defined(__ANDROID__) || defined(__linux__) - struct sysinfo info = {}; - if (0 == sysinfo(&info)) { - return (info.freeram + info.freeswap) * info.mem_unit; - } - size_t avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); - size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + /* Qualcomm SnapDragon 8cx Gen 3 */ + { + .soc_model = SC8280X, + .htp_arch = V68, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 3"}, - return avail_pages * page_size; -#else - //FIXME: Snapdragon based WoA(Windows on ARM) - MEMORYSTATUSEX statex; - statex.dwLength = sizeof(statex); - if (GlobalMemoryStatusEx(&statex)) { - GGMLQNN_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); - GGMLQNN_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); - return statex.ullAvailPhys; - } - return 0; + /* Qualcomm SnapDragon 8cx Gen 4 */ + { + .soc_model = SC8380XP, + .htp_arch = V73, + .vtcm_size_in_mb = 8, + .soc_desc = "Qualcomm SnapDragon 8cx Gen 4"}, #endif -} - -static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { - if (!dst || !src || !dst_size || !copy_size) - return 0; - - size_t min_size = dst_size < copy_size ? 
dst_size : copy_size; - memcpy(dst, src, min_size); - - return min_size; -} +}; -static char * ggmlqnn_strndup(const char * source, size_t maxlen) { -#if defined(__ANDROID__) || defined(__linux__) - return strndup(source, maxlen); +// file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html +// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend +// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend +// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend +// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend +// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend +static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { + [QNN_BACKEND_CPU] = {.device = 0, + .threads = 1, + .name = "qnn-cpu", + .desc = "Qualcomm Kryo CPU", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnCpu.dll", #else - //FIXME:behaviour is not exactly same to Android&Linux - GGML_UNUSED(maxlen); - return strdup(source); + .lib = "libQnnCpu.so", #endif -} + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, -static void * ggmlqnn_host_malloc(size_t buffer_size, size_t page_size) { - void * data = nullptr; -#if defined(__ANDROID__) || defined(__linux__) - int result = posix_memalign((void **)&data, page_size, buffer_size); - if (result != 0) { - GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); - return nullptr; - } + [QNN_BACKEND_GPU] = {.device = 1, + .threads = 1, + .name = "qnn-gpu", + .desc = "Qualcomm Adreno GPU", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnGpu.dll", #else - //GGMLQNN_LOG_DEBUG("buffer_size %d, page_size %d\n", buffer_size, page_size); - data = ggmlqnn_malloc_aligned(buffer_size, page_size); - if (nullptr == data) { - GGMLQNN_LOG_WARN("%s: error: host_malloc failed\n", __func__); - } + .lib = "libQnnGpu.so", #endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, - return data; -} - -// ================================================================================================= -// section-4: QNN helper macro / data structure / function -// ================================================================================================= -#define CHECK_QNN_API(error, result) \ - do { \ - error = (result); \ - if (QNN_SUCCESS != error) { \ - if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ - GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ - } else { \ - GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_error_string(error)); \ - } \ - } \ - } while (0) - -#define QNN_VER_PTR(x) (&((x).v1)) -#define RPCMEM_DEFAULT_FLAGS 1 -#define RPCMEM_HEAP_ID_SYSTEM 25 - -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define 
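// A minimal sketch (not part of this patch) of how g_qnn_soc_info_table is expected to be
// consulted once the HTP backend reports a chipset id; the helper name and the fallback
// behaviour are illustrative assumptions:
static const qcom_socinfo * ggmlqnn_example_find_socinfo(uint32_t soc_model) {
    const size_t table_len = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]);
    for (size_t idx = 0; idx < table_len; idx++) {
        if (g_qnn_soc_info_table[idx].soc_model == soc_model) {
            return &g_qnn_soc_info_table[idx];   // known SoC: htp_arch / vtcm size become available
        }
    }
    return nullptr;   // unknown SoC: the zero-initialized socinfo in g_qnn_mgr stays in place
}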
QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) -#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) -#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - -static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.id; - } - - return 0u; -} - -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.name; - } - return nullptr; -} + [QNN_BACKEND_NPU] = {.device = 2, + .threads = 1, + .name = "qnn-npu", + .desc = "Qualcomm NPU(Hexagon Tensor Processor)", +#if !defined(__ANDROID__) && !defined(__linux__) + .lib = "QnnHtp.dll", +#else + .lib = "libQnnHtp.so", +#endif + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}}, +}; -static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.type; - } - return QNN_TENSOR_TYPE_UNDEFINED; -} +static const qnn_op_caps_t ggmlqnn_k_op_caps[] = { + {}, // GGML_OP_NONE + {}, // GGML_OP_DUP + { + // GGML_OP_ADD + QNN_OP_ELEMENT_WISE_ADD, + 2, + }, + {}, // GGML_OP_ADD1 + {}, // GGML_OP_ACC + {}, // GGML_OP_SUB + { + // GGML_OP_MUL + QNN_OP_ELEMENT_WISE_MULTIPLY, + 2, + }, + {}, // GGML_OP_DIV + {}, // GGML_OP_SQR + {}, // GGML_OP_SQRT + {}, // GGML_OP_LOG + {}, // GGML_OP_SIN + {}, // GGML_OP_COS + {}, // GGML_OP_SUM + {}, // GGML_OP_SUM_ROWS + {}, // GGML_OP_MEAN + {}, // GGML_OP_ARGMAX + {}, // GGML_OP_COUNT_EQUAL + {}, // GGML_OP_REPEAT + {}, // GGML_OP_REPEAT_BACK + {}, // GGML_OP_CONCAT + {}, // GGML_OP_SILU_BACK + {}, // GGML_OP_NORM + {}, // GGML_OP_RMS_NORM + {}, // GGML_OP_RMS_NORM_BACK + {}, // GGML_OP_GROUP_NORM + { + // GGML_OP_MUL_MAT + QNN_OP_MAT_MUL, + 2, + }, + {}, // GGML_OP_MUL_MAT_ID + {}, // GGML_OP_OUT_PROD + {}, // GGML_OP_SCALE + {}, // GGML_OP_SET + {}, // GGML_OP_CPY + {}, // GGML_OP_CONT + {}, // GGML_OP_RESHAPE + {}, // GGML_OP_VIEW + {}, // GGML_OP_PERMUTE + {}, // GGML_OP_TRANSPOSE + {}, // GGML_OP_GET_ROWS + {}, // GGML_OP_GET_ROWS_BACK + {}, // GGML_OP_DIAG + {}, // GGML_OP_DIAG_MASK_INF + {}, // GGML_OP_DIAG_MASK_ZERO + {}, // GGML_OP_SOFT_MAX + {}, // GGML_OP_SOFT_MAX_BACK + {}, // GGML_OP_ROPE + {}, // GGML_OP_ROPE_BACK + {}, // GGML_OP_CLAMP + {}, // GGML_OP_CONV_TRANSPOSE_1D + {}, // GGML_OP_IM2COL + {}, // GGML_OP_IM2COL_BACK + {}, // GGML_OP_CONV_TRANSPOSE_2D + {}, // GGML_OP_POOL_1D + {}, // GGML_OP_POOL_2D + {}, // 
GGML_OP_POOL_2D_BACK + {}, // GGML_OP_UPSCALE + {}, // GGML_OP_PAD + {}, // GGML_OP_PAD_REFLECT_1D + {}, // GGML_OP_ARANGE + {}, // GGML_OP_TIMESTEP_EMBEDDING + {}, // GGML_OP_ARGSORT + {}, // GGML_OP_LEAKY_RELU + {}, // GGML_OP_FLASH_ATTN_EXT + {}, // GGML_OP_FLASH_ATTN_BACK + {}, // GGML_OP_SSM_CONV + {}, // GGML_OP_SSM_SCAN + {}, // GGML_OP_WIN_PART + {}, // GGML_OP_WIN_UNPART + {}, // GGML_OP_GET_REL_POS + {}, // GGML_OP_ADD_REL_POS + {}, // GGML_OP_RWKV_WKV6 + {}, // GGML_OP_GATED_LINEAR_ATTN + {}, // GGML_OP_UNARY + {}, // GGML_OP_MAP_UNARY + {}, // GGML_OP_MAP_BINARY + {}, // GGML_OP_MAP_CUSTOM1_F32 + {}, // GGML_OP_MAP_CUSTOM2_F32 + {}, // GGML_OP_MAP_CUSTOM3_F32 + {}, // GGML_OP_MAP_CUSTOM1 + {}, // GGML_OP_MAP_CUSTOM2 + {}, // GGML_OP_MAP_CUSTOM3 + {}, // GGML_OP_CROSS_ENTROPY_LOSS + {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK + {}, // GGML_OP_OPT_STEP_ADAMW + {}, // GGML_UNARY_OP_ABS + {}, // GGML_UNARY_OP_SGN + {}, // GGML_UNARY_OP_NEG + {}, // GGML_UNARY_OP_STEP + {}, // GGML_UNARY_OP_TANH + {}, // GGML_UNARY_OP_ELU + {}, // GGML_UNARY_OP_RELU + {}, // GGML_UNARY_OP_SIGMOID + {}, // GGML_UNARY_OP_GELU + {}, // GGML_UNARY_OP_GELU_QUICK + {}, // GGML_UNARY_OP_SILU + {}, // GGML_UNARY_OP_HARDSWISH + {}, // GGML_UNARY_OP_HARDSIGMOID + {}, // GGML_UNARY_OP_EXP +}; -static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataFormat; - } - return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; -} +// ================================================================================================= +// section-3: ggml-qnn internal troubleshooting function/class +// ================================================================================================= +static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
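// A minimal sketch (not part of this patch) of how ggmlqnn_k_op_caps is laid out: regular ops are
// indexed directly by ggml_op, and the GGML_UNARY_OP_* entries are appended after GGML_OP_COUNT,
// so a unary op is looked up at GGML_OP_COUNT + ggml_get_unary_op(op). The helper name below is
// an illustrative assumption:
static size_t ggmlqnn_example_op_index(const ggml_tensor * op) {
    if (op->op == GGML_OP_UNARY) {
        return (size_t)GGML_OP_COUNT + (size_t)ggml_get_unary_op(op);
    }
    return (size_t)op->op;
}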
{ + static std::mutex ggmlqnn_log_internal_mutex; + static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; -static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dataType; + GGML_UNUSED(file); +#if !(defined __ANDROID__) || !(defined ANDROID) + GGML_UNUSED(level); +#endif + { + std::lock_guard lock(ggmlqnn_log_internal_mutex); + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggmlqnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggmlqnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { +#if (defined __ANDROID__) || (defined ANDROID) + __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); + if (GGML_LOG_LEVEL_INFO == level) { + printf("%s\n", s_ggmlqnn_log_internal_buf); + } +#else + //for Snapdragon based WoA(Windows on ARM) device or Linux + printf("%s\n", s_ggmlqnn_log_internal_buf); +#endif + } + va_end(args); } - return QNN_DATATYPE_UNDEFINED; } -static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.quantizeParams; - } - return QNN_QUANTIZE_PARAMS_INIT; -} +#if ENABLE_QNNBACKEND_PERF +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; -static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.rank; + void start() { + _begin_time = ggml_time_us(); } - return 0u; -} -static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.dimensions; + void info() { + _end_time = ggml_time_us(); + _duration = (_end_time - _begin_time); + GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } - return nullptr; -} -static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - return tensor.v1.memType; +private: + int64_t _begin_time = 0LL; + int64_t _end_time = 0LL; + int64_t _duration = 0LL; + std::string _perf_name; +}; +#else +class qnn_perf { +public: + qnn_perf(const std::string & perf_name) { + GGML_UNUSED(perf_name); } - return QNN_TENSORMEMTYPE_UNDEFINED; -} + qnn_perf() = delete; + qnn_perf(const qnn_perf & ) = delete; + qnn_perf & operator= (const qnn_perf & ) = delete; -static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.id = id; + void start() {} + void info() {} +}; +#endif + +class qnn_cfg { +public: + void dump(std::function worker) { + if (!_load_success) { + GGMLQNN_LOG_INFO("qnn cfg file %s not loadded", _cfg_filename.c_str()); + return; + } + auto iter = _qnn_cfg.begin(); + while (iter != _qnn_cfg.end()) { + auto kv_iter = iter->second.begin(); + while (kv_iter != iter->second.end()) { + worker(iter->first, kv_iter->first, kv_iter->second); + ++kv_iter; + } + ++iter; + } } -} -static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.name = name; + bool load(const std::string & file_name) { 
+ if (file_name == "") { + return false; + } + _cfg_filename = file_name; + std::ifstream in; + std::string line; + in.open(file_name.c_str()); + if (not in.is_open()) { + GGMLQNN_LOG_WARN("can't open file %s", file_name.c_str()); + return false; + } + while (getline(in, line)) { + std::string section, key, value; + if (not parse_line(line, section, key, value)) { + continue; + } + set_section_keyvalue(section, key, value); + } + _load_success = true; + return true; } -} -static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.type = type; + void get_stringvalue(const std::string & section, const std::string & key, std::string & value, std::string default_value) { + value = default_value; + if (_qnn_cfg.find(section) == _qnn_cfg.end()) { + return; + } + if (_qnn_cfg[section].find(key) == _qnn_cfg[section].end()) { + return; + } + value = _qnn_cfg[section][key]; } -} -static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataFormat = format; + void get_intvalue(const std::string & section, const std::string & key, int & value, int default_value) { + value = default_value; + if (_qnn_cfg.find(section) == _qnn_cfg.end()) { + return; + } + if (_qnn_cfg[section].find(key) == _qnn_cfg[section].end()) { + return; + } + value = atol(_qnn_cfg[section][key].c_str()); } -} -static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dataType = dataType; +private: + void ltrim(std::string & str) { + if (str.empty()) return; + size_t len = 0; + char* temp = (char*)str.c_str(); + while (*temp && isblank(*temp)) { + ++len; + ++temp; + } + if (len > 0) str.erase(0, len); } -} -static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.quantizeParams = params; + void rtrim(std::string & str) { + if (str.empty()) return; + size_t len = str.length(); + size_t pos = len; + while (pos > 0) { + if (not isblank(str[pos - 1])) { + break; + } + --pos; + } + if (pos != len) str.erase(pos); } -} -static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.rank = rank; + void trim(std::string& str) { + ltrim(str); + rtrim(str); } -} -static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.dimensions = dims; + void set_section_keyvalue(std::string & section, std::string & key, std::string & value) { + if (_qnn_cfg.find(section) == _qnn_cfg.end()) { + std::unordered_map kv_map; + _qnn_cfg[section] = kv_map; + } + if (key != "" && value != "") _qnn_cfg[section][key] = value; } -} -static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memType = memType; + bool parse_line(std::string & line, std::string & section, std::string & key, std::string & value) { + static std::string cur_section = ""; + std::string nodes[2] = {"#", ";"}; + for (int i = 0; i < 2; ++i) { + std::string::size_type pos = line.find(nodes[i]); + if (pos != std::string::npos) line.erase(pos); + } + trim(line); + if (line == "") return false; + if (line[0] == '[' && line[line.size() - 1] == ']') { + 
section = line.substr(1, line.size() - 2); + trim(section); + cur_section = section; + return false; + } + if (cur_section == "") return false; + bool is_key = true; + for (size_t i = 0; i < line.size(); ++i) { + if (line[i] == '=') { + is_key = false; + continue; + } + if (is_key) { + key += line[i]; + } else { + value += line[i]; + } + } + section = cur_section; + trim(key); + trim(value); + return true; } +private: + std::unordered_map> _qnn_cfg; + bool _load_success = false; + std::string _cfg_filename; +}; + +// ================================================================================================= +// section-4: helper function for WoA(Window on ARM) +// ================================================================================================= +#if !defined(__ANDROID__) && !defined(__linux__) +#define RTLD_GLOBAL 0x100 +#define RTLD_LOCAL 0x000 +#define RTLD_LAZY 0x000 +#define RTLD_NOW 0x001 +static void * dlopen(const char * filename, int flag); +static int dlclose(void * handle); +static void * dlsym(void* handle, const char* name); +static const char * dlerror(void); + +static const char * last_func = nullptr; +static long last_err; +static void * dlopen(const char * dll, int flags) { + HINSTANCE h = LoadLibraryA(dll); + GGML_UNUSED(flags); + if (h == NULL) { + last_err = GetLastError(); + last_func = "dlopen"; + } + return h; } -static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.clientBuf = clientBuf; - } +static int dlclose(void * h) { + if (!FreeLibrary((HINSTANCE)h)) { + last_err = GetLastError(); + last_func = "dlclose"; + return -1; + } + return 0; } -static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { - if (tensor.version == QNN_TENSOR_VERSION_1) { - tensor.v1.memHandle = handle; - } +static void * dlsym(void * h, const char * name) { + FARPROC p = GetProcAddress((HINSTANCE)h, name); + if (!p) { + last_err = GetLastError(); + last_func = "dlsym"; + } + return (void*)(intptr_t)p; } -static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { - int err = 0; +static const char * dlerror(void) { + static char str[512]; + if (!last_err) return nullptr; - dst.version = src.version; - QNN_TENSOR_SET_NAME(dst, ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (nullptr == QNN_TENSOR_GET_NAME(dst)) { - return 1; - } - QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); - QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); - QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); - QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); - QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + snprintf(str, 512, "%s error #%ld", last_func, last_err); + last_err = 0; + last_func = NULL; - if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { - Qnn_ClientBuffer_t client_buf = {nullptr, 0}; - QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); - } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { - QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); - } else { - return 1; - } + return str; +} +#endif - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; - if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_AxisScaleOffset_t & axis_scale_offset = 
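// A minimal sketch (not part of this patch) of how the qnn_cfg helper above is meant to be driven;
// the on-disk layout matches the ini-style keys quoted at the top of section-2, and the values
// below are illustrative defaults:
//
//     [general]
//     print_qnn_internal_log=0
//     inference_approach=0
//
static void ggmlqnn_example_read_cfg() {
    qnn_cfg cfg;
    int print_qnn_internal_log = 0;
    int inference_approach     = 0;
    // g_qnn_runtimelib_path and g_qnn_cfgfilename are the globals defined in section-2
    if (cfg.load(std::string(g_qnn_runtimelib_path) + g_qnn_cfgfilename)) {
        cfg.get_intvalue("general", "print_qnn_internal_log", print_qnn_internal_log, 0);
        cfg.get_intvalue("general", "inference_approach",     inference_approach,     0);
    }
}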
src_qparam_cpy.axisScaleOffsetEncoding; - Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; - size_t scale_offset_size = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); - *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); - ggmlqnn_memscpy(*scale_offset, - scale_offset_size, - src_qparam.axisScaleOffsetEncoding.scaleOffset, - scale_offset_size); - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; - Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; - size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); - float ** scales = &bwaxis_scale_offset.scales; - int32_t ** offsets = &bwaxis_scale_offset.offsets; - *scales = (float *)malloc(scale_size); - ggmlqnn_memscpy(*scales, scale_size, src_qparam.bwAxisScaleOffsetEncoding.scales, scale_size); +// ================================================================================================= +// section-5: general helper function +// ================================================================================================= +//the following 3 helper funcs are used to ensure every QNN tensor name is unique +static void ggmqnn_reset_tensoridx() { + g_ggmltensor_idx = 0; +} - if (bwaxis_scale_offset.offsets != nullptr) { - size_t offset_size = bwaxis_scale_offset.numElements * sizeof(int32_t); - *offsets = (int32_t *)malloc(offset_size); - ggmlqnn_memscpy(*offsets, offset_size, src_qparam.bwAxisScaleOffsetEncoding.offsets, offset_size); - } - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); - } else { - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); - } +static void ggmqnn_inc_tensoridx() { + g_ggmltensor_idx++; +} - uint32_t rank = QNN_TENSOR_GET_RANK(src); - QNN_TENSOR_SET_RANK(dst, rank); - size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); - uint32_t * dimensions = (uint32_t *)malloc(dim_size); - if (nullptr == dimensions) { - GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); - return 1; +static int32_t ggmqnn_get_tensoridx() { +#if !defined(__ANDROID__) && !defined(__linux__) + return g_ggmltensor_idx.load(); +#else + return g_ggmltensor_idx; +#endif +} + +static void * ggmlqnn_mallocz_aligned(size_t size, size_t alignment) { + uint8_t * buffer = NULL; + size_t * sp = NULL; + buffer = static_cast(calloc(1, size + GGMLQNN_MEM_ADD(alignment))); + if (!buffer) + return NULL; + sp = (size_t *)buffer; + *sp = size; + buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); + buffer[-1] = buffer - (uint8_t *)sp; + return buffer; +} + +static void * ggmlqnn_malloc_aligned(size_t size, size_t alignment) { + uint8_t * buffer = NULL; + size_t * sp = NULL; + buffer = static_cast(malloc(size + GGMLQNN_MEM_ADD(alignment))); + if (!buffer) + return NULL; + sp = (size_t *)buffer; + *sp = size; + buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); + buffer[-1] = buffer - (uint8_t *)sp; + return buffer; +} + +static void ggmqnn_free_aligned(void * ptr) { + uint8_t * old = (uint8_t *)ptr; + if (!old) + return; + old -= old[-1]; + free(old); +} + +static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? 
offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + +static size_t ggmlqnn_get_system_total_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) + struct sysinfo info = {}; + if (0 == sysinfo(&info)) { + return (info.totalram + info.totalswap) * info.mem_unit; } - ggmlqnn_memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); - QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + size_t pages = (size_t)sysconf(_SC_PHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); - return err; + return pages * page_size; +#else + //FIXME: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLQNN_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLQNN_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullTotalPhys; + } + return 0; +#endif } -static int free_qnn_tensor(Qnn_Tensor_t * tensor) { - int err = 0; - free((void *) QNN_TENSOR_GET_NAME(*tensor)); - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); - Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; - if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { - free(src_qparam.axisScaleOffsetEncoding.scaleOffset); - } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { - free(src_qparam.bwAxisScaleOffsetEncoding.scales); - if (src_qparam.bwAxisScaleOffsetEncoding.offsets != nullptr) { - free(src_qparam.bwAxisScaleOffsetEncoding.offsets); - } +static size_t ggmlqnn_get_system_free_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) + struct sysinfo info = {}; + if (0 == sysinfo(&info)) { + return (info.freeram + info.freeswap) * info.mem_unit; } - free(QNN_TENSOR_GET_DIMENSIONS(*tensor)); - free(tensor); + size_t avail_pages = (size_t)sysconf(_SC_AVPHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); - return err; + return avail_pages * page_size; +#else + //FIXME: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLQNN_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLQNN_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullAvailPhys; + } + return 0; +#endif } -static const char * ggmlqnn_get_error_string(Qnn_ErrorHandle_t qnn_error_code) { - // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html - switch (qnn_error_code) { - case QNN_SUCCESS: - return "QNN_SUCCESS"; - case QNN_COMMON_ERROR_GENERAL: - return "QNN_COMMON_ERROR_GENERAL"; +static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { + if (!dst || !src || !dst_size || !copy_size) + return 0; - // QnnGraph_Error_t - case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: - return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; - case QNN_GRAPH_ERROR_MEM_ALLOC: - return "QNN_GRAPH_ERROR_MEM_ALLOC"; - case QNN_GRAPH_ERROR_INVALID_ARGUMENT: - return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; - case QNN_GRAPH_ERROR_INVALID_HANDLE: - return "QNN_GRAPH_ERROR_INVALID_HANDLE"; - case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: - return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; - case QNN_GRAPH_ERROR_INVALID_NAME: - return "QNN_GRAPH_ERROR_INVALID_NAME"; - case QNN_GRAPH_ERROR_INVALID_TENSOR: - return "QNN_GRAPH_ERROR_INVALID_TENSOR"; - case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: - return 
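// A minimal sketch (not part of this patch): because section-4 mirrors the POSIX dlopen API on
// Windows, the QNN libraries named in g_qnn_mgr[].lib can be loaded with identical code on
// Android, Linux and WoA. QnnInterface_getProviders is the documented QNN entry point; the helper
// name and the error handling are illustrative assumptions (the handle is deliberately kept open
// for the lifetime of the process in this sketch):
static _pfn_QnnInterface_getProviders * ggmlqnn_example_load_qnnlib(const char * lib_name) {
    void * handle = dlopen(lib_name, RTLD_NOW | RTLD_LOCAL);
    if (nullptr == handle) {
        GGMLQNN_LOG_WARN("failed to open QNN library %s", lib_name);
        return nullptr;
    }
    auto get_providers = reinterpret_cast<_pfn_QnnInterface_getProviders *>(dlsym(handle, "QnnInterface_getProviders"));
    if (nullptr == get_providers) {
        GGMLQNN_LOG_WARN("failed to resolve QnnInterface_getProviders in %s", lib_name);
        dlclose(handle);   // nothing usable in this library
    }
    return get_providers;  // caller uses it to enumerate the QNN_INTERFACE_VER_TYPE providers
}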
"QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; - case QNN_GRAPH_ERROR_SET_PROFILE: - return "QNN_GRAPH_ERROR_SET_PROFILE"; - case QNN_GRAPH_ERROR_UNCONNECTED_NODE: - return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; - case QNN_GRAPH_ERROR_CREATE_FAILED: - return "QNN_GRAPH_ERROR_CREATE_FAILED"; - case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: - return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; - case QNN_GRAPH_ERROR_FINALIZE_FAILED: - return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; - case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: - return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; - case QNN_GRAPH_ERROR_GRAPH_FINALIZED: - return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; - case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: - return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; - case QNN_GRAPH_ERROR_SIGNAL_IN_USE: - return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; - case QNN_GRAPH_ERROR_ABORTED: - return "QNN_GRAPH_ERROR_ABORTED"; - case QNN_GRAPH_ERROR_PROFILE_IN_USE: - return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; - case QNN_GRAPH_ERROR_TIMED_OUT: - return "QNN_GRAPH_ERROR_TIMED_OUT"; - case QNN_GRAPH_ERROR_SUBGRAPH: - return "QNN_GRAPH_ERROR_SUBGRAPH"; - case QNN_GRAPH_ERROR_DISABLED: - return "QNN_GRAPH_ERROR_DISABLED"; - case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: - return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; - case QNN_GRAPH_ERROR_TENSOR_SPARSITY: - return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; - case QNN_GRAPH_ERROR_EARLY_TERMINATION: - return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; - case QNN_GRAPH_ERROR_INVALID_CONTEXT: - return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; + size_t min_size = dst_size < copy_size ? dst_size : copy_size; - //QQnnTensor_Error_t - //Invalid context/graph handle in creating tensor - case QNN_TENSOR_ERROR_INVALID_HANDLE: - return "QNN_TENSOR_ERROR_INVALID_HANDLE"; - //Tensor with specified credentials not registered with a context/graph - case QNN_TENSOR_ERROR_DOES_NOT_EXIST: - return "QNN_TENSOR_ERROR_DOES_NOT_EXIST"; - // (deprecated) Tensor has already been registered with backend - case QNN_TENSOR_ERROR_ALREADY_EXISTS: - return "QNN_TENSOR_ERROR_ALREADY_EXISTS"; - // Invalid tensor param. 
- case QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM: - return "QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM"; - // This tensor param is currently unsupported - case QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM: - return "QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM"; - // Tensor provided for update is invalid - case QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE: - return "QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE"; + memcpy(dst, src, min_size); - // QnnOpPackage_Error_t - case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: - return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; - case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: - return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; - case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: - return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; - case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: - return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; - case QNN_OP_PACKAGE_ERROR_INVALID_INFO: - return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; - case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: - return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; - case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: - return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; + return min_size; +} - default: - return "unknown QNN error"; - } +static char * ggmlqnn_strndup(const char * source, size_t maxlen) { +#if defined(__ANDROID__) || defined(__linux__) + return strndup(source, maxlen); +#else + //FIXME:behaviour is not exactly same to Android&Linux + GGML_UNUSED(maxlen); + return strdup(source); +#endif } -static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, - Qnn_Param_t * params, uint32_t num_params, - Qnn_Tensor_t * inputs, uint32_t num_inputs, - Qnn_Tensor_t * outputs, uint32_t num_outputs) { - Qnn_OpConfigV1_t v1 = {name, package, type, - num_params, params, - num_inputs, inputs, - num_outputs, outputs - }; - Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; +static void * ggmlqnn_host_malloc(size_t buffer_size, size_t page_size) { + void * data = nullptr; +#if defined(__ANDROID__) || defined(__linux__) + int result = posix_memalign((void **)&data, page_size, buffer_size); + if (result != 0) { + GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); + return nullptr; + } +#else + //GGMLQNN_LOG_DEBUG("buffer_size %d, page_size %d\n", buffer_size, page_size); + data = ggmlqnn_malloc_aligned(buffer_size, page_size); + if (nullptr == data) { + GGMLQNN_LOG_WARN("%s: error: host_malloc failed\n", __func__); + } +#endif - return opcfg; + return data; +} + +static void ggmlqnn_load_cfg() { + std::string cfg_filename = std::string(g_qnn_runtimelib_path) + std::string(g_qnn_cfgfilename); + GGMLQNN_LOG_INFO("load ggml-qnn config from %s", cfg_filename.c_str()); + qnn_cfg qnncfg_instance; + qnncfg_instance.load(cfg_filename); + qnncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << section << "],[" << key << "] = [" << value << "]" << std::endl; + GGMLQNN_LOG_INFO("%s", tmposs.str().c_str()); + }); + std::string npu_inference_datatype; + qnncfg_instance.get_intvalue("general", "print_qnn_internal_log", g_print_qnn_internal_log, 0); + qnncfg_instance.get_intvalue("general", "inference_approach", g_inference_approach, 0); + qnncfg_instance.get_stringvalue("npu", "npu_inference_datatype", npu_inference_datatype, "fp32"); + GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_print_qnn_internal_log); + GGMLQNN_LOG_INFO("inference_approach=%d", 
g_inference_approach); + GGMLQNN_LOG_INFO("npu inference data type=%s", npu_inference_datatype.c_str()); } // ================================================================================================= -// section-5:ggml-qnn backend helper macro / data structure / function / class +// section-6: QNN helper function // ================================================================================================= -using pfn_rpc_mem_init = void (*)(void); -using pfn_rpc_mem_deinit = void (*)(void); -using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); -using pfn_rpc_mem_free = void (*)(void *); -using pfn_rpc_mem_to_fd = int (*)(void *); -using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); -using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); -using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); +static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.id; + } -using qnn_res_t = std::tuple>; -using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; + return 0u; +} -enum class ggml_qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 -}; +static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.name; + } + return nullptr; +} -enum qcom_htp_arch { - NONE = 0, - V68 = 68, - V69 = 69, - V73 = 73, - V75 = 75, - V79 = 79, -}; +static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.type; + } + return QNN_TENSOR_TYPE_UNDEFINED; +} -enum qcom_chipset_soc_model { - UNKNOWN_SM = 0, - SM7450 = 41, // v69, 7 Gen1 - SM8350 = 30, // v68, 888 - SM8450 = 36, // v69, SD 8 Gen 1 - SM8475 = 42, // v69, SD 8+ Gen 1 - SM8550 = 43, // v73, SD 8 Gen 2 - SM8650 = 57, // v75, SD 8 Gen 3 - SM8750 = 69, // v79, SD 8 Gen 4 -#if !defined(__ANDROID__) && !defined(__linux__) - SC7280X = 44, - SC8280X = 37, - SC8380XP = 60, -#endif -}; +static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataFormat; + } + return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; +} -struct qcom_socinfo { - uint32_t soc_model; - size_t htp_arch; - size_t vtcm_size_in_mb; - char soc_desc[GGML_MAX_NAME]; -}; +static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dataType; + } + return QNN_DATATYPE_UNDEFINED; +} -struct ggml_backend_qnn_context { - int device; - int threads; - char name[GGML_MAX_NAME]; - char desc[GGML_MAX_NAME]; - char lib[GGML_MAX_NAME]; - qnn_instance * instance; - struct ggml_backend * backend; - QNN_INTERFACE_VER_TYPE raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; - struct qcom_socinfo socinfo; +static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.quantizeParams; + } + return QNN_QUANTIZE_PARAMS_INIT; +} - std::unique_ptr work_data; - std::vector> tasks; - size_t work_size = 0; - size_t desired_size = 0; - int n_threads = GGML_DEFAULT_N_THREADS; -}; +static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.rank; + } + return 0u; +} -struct qnn_op_caps_t { - const char 
* qnn_op_name = nullptr; - const size_t input_param_count = 0; - const char * qnn_param_name = nullptr; -}; +static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.dimensions; + } + return nullptr; +} -//file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices -static struct qcom_socinfo g_qnn_soc_info_table[] = { - /* Qualcomm SnapDragon 7 Gen 1 */ - { - .soc_model = SM7450, - .htp_arch = V69, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 7 Gen 1"}, +static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + return tensor.v1.memType; + } + return QNN_TENSORMEMTYPE_UNDEFINED; +} - /* Qualcomm SnapDragon 888 */ - { - .soc_model = SM8350, - .htp_arch = V68, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 888 "}, +static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.id = id; + } +} - /* Qualcomm SnapDragon 8 Gen 1 */ - { - .soc_model = SM8450, - .htp_arch = V69, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 1"}, - - /* Qualcomm SnapDragon 8 Gen 1+ */ - { - .soc_model = SM8475, - .htp_arch = V69, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 1+"}, - - /* Qualcomm SnapDragon 8 Gen 2 */ - { - .soc_model = SM8550, - .htp_arch = V73, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 2"}, +static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.name = name; + } +} - /* Qualcomm SnapDragon 8 Gen 3 */ - { - .soc_model = SM8650, - .htp_arch = V75, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 3 "}, +static inline void set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.type = type; + } +} - /* Qualcomm SnapDragon 8 Gen 4 */ - { - .soc_model = SM8750, - .htp_arch = V79, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, +static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataFormat = format; + } +} -#if !defined(__ANDROID__) && !defined(__linux__) - /* Qualcomm SnapDragon 7c Gen 2 */ - { - .soc_model = SC7280X, - .htp_arch = V68, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 7c Gen 2"}, +static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dataType = dataType; + } +} - /* Qualcomm SnapDragon 8cx Gen 3 */ - { - .soc_model = SC8280X, - .htp_arch = V68, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8cx Gen 3"}, +static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.quantizeParams = params; + } +} - /* Qualcomm SnapDragon 8cx Gen 4 */ - { - .soc_model = SC8380XP, - .htp_arch = V73, - .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8cx Gen 4"}, -#endif +static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.rank = rank; + } +} -}; +static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t 
* dims) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.dimensions = dims; + } +} +static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memType = memType; + } +} -#if defined(__ANDROID__) -static const char * g_qnn_runtimelib_path = "/data/local/tmp/"; -#elif defined(__linux__) -static const char * g_qnn_runtimelib_path = "/tmp/"; -#elif defined(_WIN32) -static const char * g_qnn_runtimelib_path = "C:\\"; -#else //cygwin on Windows -static const char * g_qnn_runtimelib_path = "/cygdrive/c/"; -#endif -//the following helper funcs are used to ensure every QNN tensor name is unique -static std::atomic g_ggmltensor_idx(0); -static void reset_idx() { - g_ggmltensor_idx = 0; +static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.clientBuf = clientBuf; + } } -static void inc_idx() { - g_ggmltensor_idx++; +static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { + if (tensor.version == QNN_TENSOR_VERSION_1) { + tensor.v1.memHandle = handle; + } } -static int32_t get_idx() { - return g_ggmltensor_idx.load(); +static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { + int err = 0; + + dst.version = src.version; + QNN_TENSOR_SET_NAME(dst, ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); + if (nullptr == QNN_TENSOR_GET_NAME(dst)) { + return 1; + } + QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); + QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); + QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); + QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); + QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + + if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + Qnn_ClientBuffer_t client_buf = {nullptr, 0}; + QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); + } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + } else { + return 1; + } + + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_AxisScaleOffset_t & axis_scale_offset = src_qparam_cpy.axisScaleOffsetEncoding; + Qnn_ScaleOffset_t ** scale_offset = &axis_scale_offset.scaleOffset; + size_t scale_offset_size = axis_scale_offset.numScaleOffsets * sizeof(Qnn_ScaleOffset_t); + *scale_offset = (Qnn_ScaleOffset_t *)malloc(scale_offset_size); + ggmlqnn_memscpy(*scale_offset, + scale_offset_size, + src_qparam.axisScaleOffsetEncoding.scaleOffset, + scale_offset_size); + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; + Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; + size_t scale_size = bwaxis_scale_offset.numElements * sizeof(float); + float ** scales = &bwaxis_scale_offset.scales; + int32_t ** offsets = &bwaxis_scale_offset.offsets; + *scales = (float *)malloc(scale_size); + ggmlqnn_memscpy(*scales, scale_size, src_qparam.bwAxisScaleOffsetEncoding.scales, scale_size); + + if (bwaxis_scale_offset.offsets != nullptr) { + size_t offset_size = 
bwaxis_scale_offset.numElements * sizeof(int32_t); + *offsets = (int32_t *)malloc(offset_size); + ggmlqnn_memscpy(*offsets, offset_size, src_qparam.bwAxisScaleOffsetEncoding.offsets, offset_size); + } + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + } else { + QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); + } + + uint32_t rank = QNN_TENSOR_GET_RANK(src); + QNN_TENSOR_SET_RANK(dst, rank); + size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); + uint32_t * dimensions = (uint32_t *)malloc(dim_size); + if (nullptr == dimensions) { + GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + return 1; + } + ggmlqnn_memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); + QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + + return err; } -// file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/quantization.html -// CPU - Choose a non-quantized model.Quantized models are currently incompatible with the CPU backend -// GPU - Choose a non-quantized model.Quantized models are currently incompatible with the GPU backend -// HTP - Choose a quantized model. Quantized models are required when running on the HTP backend -// DSP - Choose a quantized model. Quantized models are required when running on the DSP backend -// HTA - Choose a quantized model. Quantized models are required when running on the HTA backend -static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = {.device = 0, - .threads = 1, - .name = "qnn-cpu", - .desc = "Qualcomm Kryo CPU", -#if !defined(__ANDROID__) && !defined(__linux__) - .lib = "QnnCpu.dll", -#else - .lib = "libQnnCpu.so", -#endif - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, +static int free_qnn_tensor(Qnn_Tensor_t * tensor) { + int err = 0; + free((void *) QNN_TENSOR_GET_NAME(*tensor)); + Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); + Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; + if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { + free(src_qparam.axisScaleOffsetEncoding.scaleOffset); + } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { + free(src_qparam.bwAxisScaleOffsetEncoding.scales); + if (src_qparam.bwAxisScaleOffsetEncoding.offsets != nullptr) { + free(src_qparam.bwAxisScaleOffsetEncoding.offsets); + } + } + free(QNN_TENSOR_GET_DIMENSIONS(*tensor)); + free(tensor); - [QNN_BACKEND_GPU] = {.device = 1, - .threads = 1, - .name = "qnn-gpu", - .desc = "Qualcomm Adreno GPU", -#if !defined(__ANDROID__) && !defined(__linux__) - .lib = "QnnGpu.dll", -#else - .lib = "libQnnGpu.so", -#endif - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, + return err; +} - [QNN_BACKEND_NPU] = {.device = 2, - .threads = 1, - .name = "qnn-npu", - .desc = "Qualcomm NPU(Hexagon Tensor Processor)", -#if !defined(__ANDROID__) && !defined(__linux__) - .lib = "QnnHtp.dll", -#else - .lib = "libQnnHtp.so", -#endif - .instance = nullptr, - .backend = nullptr, - .raw_interface = {}, - .raw_system_interface = {}, - .socinfo = {}}, -}; +static const char * ggmlqnn_get_qnnerror_string(Qnn_ErrorHandle_t qnn_error_code) { + // file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/api_error_codes.html + switch (qnn_error_code) { + case QNN_SUCCESS: + return "QNN_SUCCESS"; + case QNN_COMMON_ERROR_GENERAL: + return "QNN_COMMON_ERROR_GENERAL"; -static const 
qnn_op_caps_t ggmlqnn_k_op_caps[] = { - {}, // GGML_OP_NONE - {}, // GGML_OP_DUP - { - // GGML_OP_ADD - QNN_OP_ELEMENT_WISE_ADD, - 2, - }, - {}, // GGML_OP_ADD1 - {}, // GGML_OP_ACC - {}, // GGML_OP_SUB - { - // GGML_OP_MUL - QNN_OP_ELEMENT_WISE_MULTIPLY, - 2, - }, - {}, // GGML_OP_DIV - {}, // GGML_OP_SQR - {}, // GGML_OP_SQRT - {}, // GGML_OP_LOG - {}, // GGML_OP_SIN - {}, // GGML_OP_COS - {}, // GGML_OP_SUM - {}, // GGML_OP_SUM_ROWS - {}, // GGML_OP_MEAN - {}, // GGML_OP_ARGMAX - {}, // GGML_OP_COUNT_EQUAL - {}, // GGML_OP_REPEAT - {}, // GGML_OP_REPEAT_BACK - {}, // GGML_OP_CONCAT - {}, // GGML_OP_SILU_BACK - {}, // GGML_OP_NORM - {}, // GGML_OP_RMS_NORM - {}, // GGML_OP_RMS_NORM_BACK - {}, // GGML_OP_GROUP_NORM - { - // GGML_OP_MUL_MAT - QNN_OP_MAT_MUL, - 2, - }, - {}, // GGML_OP_MUL_MAT_ID - {}, // GGML_OP_OUT_PROD - {}, // GGML_OP_SCALE - {}, // GGML_OP_SET - {}, // GGML_OP_CPY - {}, // GGML_OP_CONT - {}, // GGML_OP_RESHAPE - {}, // GGML_OP_VIEW - {}, // GGML_OP_PERMUTE - {}, // GGML_OP_TRANSPOSE - {}, // GGML_OP_GET_ROWS - {}, // GGML_OP_GET_ROWS_BACK - {}, // GGML_OP_DIAG - {}, // GGML_OP_DIAG_MASK_INF - {}, // GGML_OP_DIAG_MASK_ZERO - {}, // GGML_OP_SOFT_MAX - {}, // GGML_OP_SOFT_MAX_BACK - {}, // GGML_OP_ROPE - {}, // GGML_OP_ROPE_BACK - {}, // GGML_OP_CLAMP - {}, // GGML_OP_CONV_TRANSPOSE_1D - {}, // GGML_OP_IM2COL - {}, // GGML_OP_IM2COL_BACK - {}, // GGML_OP_CONV_TRANSPOSE_2D - {}, // GGML_OP_POOL_1D - {}, // GGML_OP_POOL_2D - {}, // GGML_OP_POOL_2D_BACK - {}, // GGML_OP_UPSCALE - {}, // GGML_OP_PAD - {}, // GGML_OP_PAD_REFLECT_1D - {}, // GGML_OP_ARANGE - {}, // GGML_OP_TIMESTEP_EMBEDDING - {}, // GGML_OP_ARGSORT - {}, // GGML_OP_LEAKY_RELU - {}, // GGML_OP_FLASH_ATTN_EXT - {}, // GGML_OP_FLASH_ATTN_BACK - {}, // GGML_OP_SSM_CONV - {}, // GGML_OP_SSM_SCAN - {}, // GGML_OP_WIN_PART - {}, // GGML_OP_WIN_UNPART - {}, // GGML_OP_GET_REL_POS - {}, // GGML_OP_ADD_REL_POS - {}, // GGML_OP_RWKV_WKV6 - {}, // GGML_OP_GATED_LINEAR_ATTN - {}, // GGML_OP_UNARY - {}, // GGML_OP_MAP_UNARY - {}, // GGML_OP_MAP_BINARY - {}, // GGML_OP_MAP_CUSTOM1_F32 - {}, // GGML_OP_MAP_CUSTOM2_F32 - {}, // GGML_OP_MAP_CUSTOM3_F32 - {}, // GGML_OP_MAP_CUSTOM1 - {}, // GGML_OP_MAP_CUSTOM2 - {}, // GGML_OP_MAP_CUSTOM3 - {}, // GGML_OP_CROSS_ENTROPY_LOSS - {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - {}, // GGML_OP_OPT_STEP_ADAMW - {}, // GGML_UNARY_OP_ABS - {}, // GGML_UNARY_OP_SGN - {}, // GGML_UNARY_OP_NEG - {}, // GGML_UNARY_OP_STEP - {}, // GGML_UNARY_OP_TANH - {}, // GGML_UNARY_OP_ELU - {}, // GGML_UNARY_OP_RELU - {}, // GGML_UNARY_OP_SIGMOID - {}, // GGML_UNARY_OP_GELU - {}, // GGML_UNARY_OP_GELU_QUICK - {}, // GGML_UNARY_OP_SILU - {}, // GGML_UNARY_OP_HARDSWISH - {}, // GGML_UNARY_OP_HARDSIGMOID - {}, // GGML_UNARY_OP_EXP -}; + // QnnGraph_Error_t + case QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE: + return "QNN_GRAPH_ERROR_UNSUPPORTED_FEATURE"; + case QNN_GRAPH_ERROR_MEM_ALLOC: + return "QNN_GRAPH_ERROR_MEM_ALLOC"; + case QNN_GRAPH_ERROR_INVALID_ARGUMENT: + return "QNN_GRAPH_ERROR_INVALID_ARGUMENT"; + case QNN_GRAPH_ERROR_INVALID_HANDLE: + return "QNN_GRAPH_ERROR_INVALID_HANDLE"; + case QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST: + return "QNN_GRAPH_ERROR_GRAPH_DOES_NOT_EXIST"; + case QNN_GRAPH_ERROR_INVALID_NAME: + return "QNN_GRAPH_ERROR_INVALID_NAME"; + case QNN_GRAPH_ERROR_INVALID_TENSOR: + return "QNN_GRAPH_ERROR_INVALID_TENSOR"; + case QNN_GRAPH_ERROR_INVALID_OP_CONFIG: + return "QNN_GRAPH_ERROR_INVALID_OP_CONFIG"; + case QNN_GRAPH_ERROR_SET_PROFILE: + return "QNN_GRAPH_ERROR_SET_PROFILE"; + case 
QNN_GRAPH_ERROR_UNCONNECTED_NODE: + return "QNN_GRAPH_ERROR_UNCONNECTED_NODE"; + case QNN_GRAPH_ERROR_CREATE_FAILED: + return "QNN_GRAPH_ERROR_CREATE_FAILED"; + case QNN_GRAPH_ERROR_OPTIMIZATION_FAILED: + return "QNN_GRAPH_ERROR_OPTIMIZATION_FAILED"; + case QNN_GRAPH_ERROR_FINALIZE_FAILED: + return "QNN_GRAPH_ERROR_FINALIZE_FAILED"; + case QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_NOT_FINALIZED"; + case QNN_GRAPH_ERROR_GRAPH_FINALIZED: + return "QNN_GRAPH_ERROR_GRAPH_FINALIZED"; + case QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL: + return "QNN_GRAPH_ERROR_EXECUTION_ASYNC_FIFO_FULL"; + case QNN_GRAPH_ERROR_SIGNAL_IN_USE: + return "QNN_GRAPH_ERROR_SIGNAL_IN_USE"; + case QNN_GRAPH_ERROR_ABORTED: + return "QNN_GRAPH_ERROR_ABORTED"; + case QNN_GRAPH_ERROR_PROFILE_IN_USE: + return "QNN_GRAPH_ERROR_PROFILE_IN_USE"; + case QNN_GRAPH_ERROR_TIMED_OUT: + return "QNN_GRAPH_ERROR_TIMED_OUT"; + case QNN_GRAPH_ERROR_SUBGRAPH: + return "QNN_GRAPH_ERROR_SUBGRAPH"; + case QNN_GRAPH_ERROR_DISABLED: + return "QNN_GRAPH_ERROR_DISABLED"; + case QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE: + return "QNN_GRAPH_ERROR_DYNAMIC_TENSOR_SHAPE"; + case QNN_GRAPH_ERROR_TENSOR_SPARSITY: + return "QNN_GRAPH_ERROR_TENSOR_SPARSITY"; + case QNN_GRAPH_ERROR_EARLY_TERMINATION: + return "QNN_GRAPH_ERROR_EARLY_TERMINATION"; + case QNN_GRAPH_ERROR_INVALID_CONTEXT: + return "QNN_GRAPH_ERROR_INVALID_CONTEXT"; + + //QQnnTensor_Error_t + //Invalid context/graph handle in creating tensor + case QNN_TENSOR_ERROR_INVALID_HANDLE: + return "QNN_TENSOR_ERROR_INVALID_HANDLE"; + //Tensor with specified credentials not registered with a context/graph + case QNN_TENSOR_ERROR_DOES_NOT_EXIST: + return "QNN_TENSOR_ERROR_DOES_NOT_EXIST"; + // (deprecated) Tensor has already been registered with backend + case QNN_TENSOR_ERROR_ALREADY_EXISTS: + return "QNN_TENSOR_ERROR_ALREADY_EXISTS"; + // Invalid tensor param. 
+ case QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM: + return "QNN_TENSOR_ERROR_INVALID_TENSOR_PARAM"; + // This tensor param is currently unsupported + case QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM: + return "QNN_TENSOR_ERROR_UNSUPPORTED_TENSOR_PARAM"; + // Tensor provided for update is invalid + case QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE: + return "QNN_TENSOR_ERROR_INCOMPATIBLE_TENSOR_UPDATE"; + + // QnnOpPackage_Error_t + case QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED: + return "QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED"; + case QNN_OP_PACKAGE_ERROR_INVALID_HANDLE: + return "QNN_OP_PACKAGE_ERROR_INVALID_HANDLE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFRASTRUCTURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_INFO: + return "QNN_OP_PACKAGE_ERROR_INVALID_INFO"; + case QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE: + return "QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE"; + case QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT: + return "QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT"; + + default: + return "unknown QNN error"; + } +} + +static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t num_outputs) { + Qnn_OpConfigV1_t v1 = {name, package, type, + num_params, params, + num_inputs, inputs, + num_outputs, outputs + }; + Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; + + return opcfg; +} -static const char * qnn_get_socmodel_desc(uint32_t soc_model) { +// ================================================================================================= +// section-7:ggml-qnn backend helper function / class +// ================================================================================================= +static const char * ggmlqnn_get_socmodel_desc(uint32_t soc_model) { switch (soc_model) { case SM7450: return "SM7450"; @@ -1180,7 +1370,7 @@ static const char * qnn_get_socmodel_desc(uint32_t soc_model) { } } -static const char * qnn_get_htparch_desc(size_t htp_arch) { +static const char * ggmlqnn_get_htparch_desc(size_t htp_arch) { switch (htp_arch) { case V68: return "QCOM_HTP_V68"; @@ -1207,13 +1397,7 @@ static struct qcom_socinfo * ggmlqnn_get_socinfo_from_socmodel(uint32_t soc_mode return nullptr; } - -static const char * ggml_get_type_name(ggml_type type) { - const struct ggml_type_traits * traits = ggml_get_type_traits(type); - return traits->type_name; -} - -static const char * get_ggml_type_name(ggml_type type) { +static const char * ggmlqnn_get_ggml_type_name(ggml_type type) { const auto * traits = ggml_get_type_traits(type); return traits->type_name; } @@ -1260,7 +1444,7 @@ static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { return GGML_TYPE_COUNT; } -static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) { +static void ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) { if (rank > GGML_MAX_DIMS) { GGMLQNN_LOG_WARN("invalid params"); return; @@ -1278,7 +1462,6 @@ static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, c } } - static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op) { const ggml_tensor * src0 = op->src[0]; const ggml_tensor 
* src1 = op->src[1]; @@ -1347,9 +1530,9 @@ static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * o return wdata; } -static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { +static void ggmlqnn_append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { char buffer[256] = {}; - const char * type_name = get_ggml_type_name(tensor->type); + const char * type_name = ggmlqnn_get_ggml_type_name(tensor->type); int len = 0; switch (ggml_n_dims(tensor)) { case 1: @@ -1393,7 +1576,7 @@ static size_t ggmlqnn_get_op_input_param_count(const ggml_tensor * op) { static void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output) { GGML_ASSERT(op->op != GGML_OP_NONE); output += ggml_op_desc(op); - output += get_ggml_type_name(op->type); + output += ggmlqnn_get_ggml_type_name(op->type); size_t param_count = ggmlqnn_get_op_input_param_count(op); for (size_t i = 0; i < param_count; ++i) { auto * input = op->src[i]; @@ -1401,12 +1584,61 @@ static void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & o break; } output += '_'; - append_tensor_dimensions(input, output); + ggmlqnn_append_tensor_dimensions(input, output); + } +} + +static void ggmlqnn_get_opkey_with_srcop_desc(const ggml_tensor * op, std::string & output) { + output += ggml_op_desc(op); + output += '('; + if (op->src[0]) { + output += ggml_op_desc(op->src[0]); + } + for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) { + output += ','; + output += ggml_op_desc(op->src[i]); + } + output += ')'; +} + +static void ggmlqnn_get_graphkey_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { + if (nullptr == cgraph || 0 == cgraph->n_nodes) { + GGMLQNN_LOG_WARN("empty ggml computational graph"); + return; + } + + bool is_start = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + auto * op = cgraph->nodes[i]; + if (ggml_is_empty(op)) { + GGMLQNN_LOG_WARN("empty op in graph, skipping"); + continue; + } + + if (op->op == GGML_OP_NONE) { + GGMLQNN_LOG_WARN("GGML_OP_NONE in graph, skipping"); + continue; + } + + if (is_start) { + ggmlqnn_get_graphkey_from_op(cgraph->nodes[0], output); + is_start = false; + } else { + output += '#'; + ggmlqnn_get_opkey_with_srcop_desc(op, output); + } + } + + if (cgraph->n_nodes > 1) { + auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; + output += ggmlqnn_get_ggml_type_name(last_op->type); + output += '_'; + ggmlqnn_append_tensor_dimensions(last_op, output); } } template -Fn load_qnn_functionpointers(void * handle, const char * function_name) { +Fn ggmlqnn_load_qnn_functionpointers(void * handle, const char * function_name) { return reinterpret_cast(dlsym(handle, function_name)); } @@ -1667,7 +1899,17 @@ class qnn_instance { void * alloc_rpcmem_internal(size_t bytes, size_t alignment); - void probe_device_meminfo(); + void htp_print_info(); + + void htp_probe_device_meminfo(); + + void print_backend_info(); + + void htp_set_memory_grow_size(size_t size = 1ul * 1024 * 1024); + + void htp_enter_performance_mode(); + + void htp_set_n_hvx_threads(size_t n_threads); private: static constexpr const int _required_num_providers = 1; @@ -1685,6 +1927,8 @@ class qnn_instance { ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; void * _system_lib_handle = nullptr; + void * _loaded_lib_handle = nullptr; + const QnnInterface_t * _loaded_backend = nullptr; Qnn_GraphHandle_t _qnn_graph_handle = nullptr; @@ -1701,8 +1945,11 @@ class qnn_instance { QnnSystemContext_Handle_t 
_qnn_system_handle = nullptr; QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_power_configid = 1; - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + uint32_t _qnn_htp_powerconfig_id = 1; + uint32_t _qnn_htp_device_id = 0; + uint32_t _qnn_htp_core_id = 0; + + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing qnn_interface _qnn_interface; QNN_INTERFACE_VER_TYPE _qnn_raw_interface; @@ -1711,11 +1958,6 @@ class qnn_instance { std::unordered_map _qnn_mem_set; std::unordered_map _qnn_rpc_buffer_to_handles; - static std::mutex _init_mutex; - static std::unordered_map _loaded_lib_handle; - static std::unordered_map _lib_path_to_backend_id; - static std::unordered_map _loaded_backend; - std::atomic_bool _rpcmem_initialized{false}; pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; pfn_rpc_mem_free _pfn_rpc_mem_free; @@ -1736,12 +1978,6 @@ class qnn_instance { DISABLE_MOVE(qnn_instance); }; - -std::mutex qnn_instance::_init_mutex; -std::unordered_map qnn_instance::_loaded_lib_handle; -std::unordered_map qnn_instance::_lib_path_to_backend_id; -std::unordered_map qnn_instance::_loaded_backend; - void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { if (!_rpcmem_initialized) { GGMLQNN_LOG_WARN("rpc memory not initialized\n"); @@ -1990,7 +2226,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * return 1; } - auto get_providers = load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + auto get_providers = ggmlqnn_load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( lib_handle, "QnnInterface_getProviders"); if (nullptr == get_providers) { @@ -1998,7 +2234,6 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * return 2; } - // get QnnInterface Providers std::uint32_t num_providers = 0; const QnnInterface_t ** provider_list = nullptr; error = get_providers(&provider_list, &num_providers); @@ -2036,25 +2271,12 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * set_qnn_raw_interface(qnn_interface); BackendIdType backend_id = provider_list[0]->backendId; - _lib_path_to_backend_id[lib_path] = backend_id; - if (_loaded_backend.count(backend_id) > 0) { - GGMLQNN_LOG_WARN("lib_path %s is loaded, but backend %d already exists\n", - lib_path.c_str(), backend_id); - } - _loaded_backend[backend_id] = provider_list[0]; - if (_loaded_lib_handle.count(backend_id) > 0) { - GGMLQNN_LOG_WARN("closing %p\n", _loaded_lib_handle[backend_id]); - int dlclose_error = dlclose(_loaded_lib_handle[backend_id]); - if (dlclose_error != 0) { - GGMLQNN_LOG_WARN("fail to close %p with error %s\n", _loaded_lib_handle[backend_id], dlerror()); - } - } - _loaded_lib_handle[backend_id] = lib_handle; - _backend_id = backend_id; + _loaded_backend = provider_list[0]; + _loaded_lib_handle = lib_handle; + _backend_id = backend_id; auto saver_initialize = - load_qnn_functionpointers<_pfn_QnnSaver_initialize *>( - _loaded_lib_handle[backend_id], "QnnSaver_initialize"); + ggmlqnn_load_qnn_functionpointers<_pfn_QnnSaver_initialize *>(_loaded_lib_handle, "QnnSaver_initialize"); if (nullptr != saver_initialize) { error = saver_initialize(saver_config); if (error != QNN_SUCCESS) { @@ -2070,17 +2292,11 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t * int qnn_instance::unload_backend() { int dlclose_error = 0; - for (auto & it : _loaded_lib_handle) { - dlclose_error = dlclose(it.second); - if (dlclose_error != 0) { - 
GGMLQNN_LOG_WARN("failed to close QNN backend %d, error %s\n", it.first, dlerror()); - } + dlclose_error = dlclose(_loaded_lib_handle); + if (dlclose_error != 0) { + GGMLQNN_LOG_WARN("failed to close QNN backend %d, error %s\n", _backend_id, dlerror()); } - _loaded_lib_handle.clear(); - _lib_path_to_backend_id.clear(); - _loaded_backend.clear(); - return 0; } @@ -2195,12 +2411,14 @@ int qnn_instance::unload_system() { return result; } -#if GGMLQNN_PRINT_QNN_INTERNAL_LOG static void ggml_qnn_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { + if (0 == g_print_qnn_internal_log) + return; + static std::mutex log_mutex; static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; @@ -2234,39 +2452,12 @@ static void ggml_qnn_logcallback(const char * fmt, GGMLQNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); } } -#else -static void ggml_qnn_logcallback(const char * fmt, - QnnLog_Level_t level, - uint64_t timestamp, - va_list argp) { - GGML_UNUSED(fmt); - GGML_UNUSED(level); - GGML_UNUSED(timestamp); - GGML_UNUSED(argp); -} -#endif int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { BackendIdType backend_id = QNN_BACKEND_ID_NULL; GGMLQNN_LOG_DEBUG("enter qni_init\n"); - const std::lock_guard lock(_init_mutex); - if (0 != load_system()) { - GGMLQNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } else { - GGMLQNN_LOG_DEBUG("load QNN system lib successfully\n"); - } - - std::string backend_lib_path = _lib_path + _backend_name; - if (0 == _lib_path_to_backend_id.count(backend_lib_path)) { - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - GGMLQNN_LOG_WARN("failed to load QNN backend\n"); - return 2; - } - } - _device_id = QNN_BACKEND_CPU; + _device_id = QNN_BACKEND_GGML; if (_backend_name.find("QnnCpu") != std::string::npos) { _device_id = QNN_BACKEND_CPU; } @@ -2276,17 +2467,27 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (_backend_name.find("QnnHtp") != std::string::npos) { _device_id = QNN_BACKEND_NPU; } + if (QNN_BACKEND_GGML == _device_id) { + GGMLQNN_LOG_INFO("user specified qnn backend is ggml, skip QNN initialize"); + return 0; + } - backend_id = _lib_path_to_backend_id[backend_lib_path]; - if (0 == _loaded_backend.count(backend_id) || - 0 == _loaded_lib_handle.count(backend_id)) { - GGMLQNN_LOG_WARN("library %s is loaded but loaded backend count=%zu, loaded lib_handle count=%zu\n", - backend_lib_path.c_str(), - _loaded_backend.count(backend_id), - _loaded_lib_handle.count(backend_id)); - return 3; + if (0 != load_system()) { + GGMLQNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + GGMLQNN_LOG_DEBUG("load QNN system lib successfully\n"); + } + + std::string backend_lib_path = _lib_path + _backend_name; + + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + GGMLQNN_LOG_WARN("failed to load QNN backend\n"); + return 2; } - _qnn_interface.set_qnn_interface(_loaded_backend[backend_id]); + + _qnn_interface.set_qnn_interface(_loaded_backend); #if 1 _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); #else @@ -2294,7 +2495,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { #endif if (nullptr == _qnn_log_handle) { GGMLQNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone - return 4; + return 3; } else { GGMLQNN_LOG_DEBUG("initialize 
qnn log successfully\n"); } @@ -2305,7 +2506,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { &_qnn_backend_handle); if (nullptr == _qnn_backend_handle) { GGMLQNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 5; + return 4; } else { GGMLQNN_LOG_DEBUG("initialize qnn backend successfully\n"); } @@ -2335,7 +2536,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; + return 5; } else { GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); } @@ -2344,7 +2545,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 7; + return 6; } else { GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); } @@ -2358,7 +2559,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { #endif if (nullptr == _rpc_lib_handle) { GGMLQNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 8; + return 7; } else { GGMLQNN_LOG_DEBUG("load rpcmem lib successfully\n"); set_rpcmem_initialized(true); @@ -2372,7 +2573,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { || nullptr == _pfn_rpc_mem_to_fd) { GGMLQNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); - return 9; + return 8; } if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy @@ -2384,51 +2585,30 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { &_qnn_context_handle); if (nullptr == _qnn_context_handle) { GGMLQNN_LOG_WARN("why failed to initialize qnn context, error:%s\n", strerror(errno)); - return 10; + return 9; } else { GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n"); } if (_backend_name.find("Htp") != std::string::npos) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - GGMLQNN_LOG_INFO("device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (size_t i = 0; i < p_info->v1.numHwDevices; i++) { - GGMLQNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - GGMLQNN_LOG_INFO("htp_type:%d(%s)", devinfo->devType, - (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); - GGMLQNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB", \ - chipinfo.socModel, qnn_get_socmodel_desc(chipinfo.socModel), \ - htp_arch, qnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize); - struct qcom_socinfo * socinfo = ggmlqnn_get_socinfo_from_socmodel(chipinfo.socModel); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; - if (nullptr != socinfo) { - memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); - GGMLQNN_LOG_INFO("soc info:%s", socinfo->soc_desc); - } else { - memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, "unknown", 7); - GGMLQNN_LOG_INFO("soc info:unknown"); - } - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + htp_print_info(); - probe_device_meminfo(); + htp_probe_device_meminfo(); if (0 != init_htp_perfinfra()) { GGMLQNN_LOG_WARN("initialize HTP performance failure"); } +#if 0 if (0 != set_rpc_polling()) { GGMLQNN_LOG_WARN("set RPC polling failure"); } if (0 != set_high_performance_mode()) { GGMLQNN_LOG_WARN("set HTP high performance mode failure"); } - +#else + htp_set_memory_grow_size(); + htp_enter_performance_mode(); +#endif if (enable_qnn_rpc()) { GGMLQNN_LOG_INFO("NPU RPC feature enabled"); } else { @@ -2436,6 +2616,8 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } + print_backend_info(); + GGMLQNN_LOG_DEBUG("leave qni_init\n"); return 0; @@ -2446,7 +2628,7 @@ int qnn_instance::qnn_finalize() { Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("enter %s\n", __func__); - reset_idx(); + ggmqnn_reset_tensoridx(); free_rpcmem(); unregister_rpcmem(); @@ -2555,8 +2737,15 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; + QnnHtpGraph_CustomConfig_t fp16_config; + fp16_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + fp16_config.precision = QNN_PRECISION_FLOAT16; + QnnGraph_Config_t graph_fp16_config; + graph_fp16_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_fp16_config.customConfig = &fp16_config; + const QnnGraph_Config_t * graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, nullptr}; + &graph_opt_config, &graph_fp16_config, nullptr}; error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs, &graph_handle); } else { error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &graph_handle); @@ -2565,12 +2754,15 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi if (error != QNN_SUCCESS) { GGMLQNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", ggml_backend_qnn_get_devname(device), graph_name.c_str(), - ggmlqnn_get_error_string(error)); + ggmlqnn_get_qnnerror_string(error)); return error; } GGMLQNN_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); _qnn_graph_handle = graph_handle; + if (device == QNN_BACKEND_NPU) { + htp_set_n_hvx_threads(hvx_threads); + } return QNN_SUCCESS; } @@ -2636,11 +2828,14 @@ int qnn_instance::init_htp_perfinfra() { QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; + uint32_t 
device_id = 0; + uint32_t core_id = 0; htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - _qnn_htp_perfinfra = htp_perfinfra; - _qnn_power_configid = power_configid; + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_htp_powerconfig_id = power_configid; + //FIXME:hardcode to 0 and 0 although it's correct + _qnn_htp_device_id = device_id; + _qnn_htp_core_id = core_id; return 0; } @@ -2653,7 +2848,7 @@ int qnn_instance::set_rpc_polling() { rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); } } return 0; @@ -2670,7 +2865,7 @@ int qnn_instance::set_high_performance_mode() { power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; power_config.dcvsV3Config.dcvsEnable = 0; power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.contextId = _qnn_power_configid; + power_config.dcvsV3Config.contextId = _qnn_htp_powerconfig_id; power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False @@ -2691,12 +2886,43 @@ int qnn_instance::set_high_performance_mode() { // set power config with different performance parameters const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr}; - _qnn_htp_perfinfra->setPowerConfig(_qnn_power_configid, power_configs); + _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); return 0; } -void qnn_instance::probe_device_meminfo() { +void qnn_instance::htp_print_info() { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + GGMLQNN_LOG_INFO("HTP device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (size_t i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLQNN_LOG_INFO("HTP deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + GGMLQNN_LOG_INFO("HTP_TYPE:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); + GGMLQNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB," \ + "dlbc_support:%d, signedpd_support:%d", \ + chipinfo.socModel, ggmlqnn_get_socmodel_desc(chipinfo.socModel), \ + htp_arch, ggmlqnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize, \ + chipinfo.dlbcSupport, chipinfo.signedPdSupport); + struct qcom_socinfo * socinfo = ggmlqnn_get_socinfo_from_socmodel(chipinfo.socModel); + g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; + if (nullptr != socinfo) { + memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); + GGMLQNN_LOG_INFO("soc info:%s", socinfo->soc_desc); + } else { + memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, "unknown", 7); + GGMLQNN_LOG_INFO("soc info:unknown"); + } + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); +} + +void qnn_instance::htp_probe_device_meminfo() { size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; const int SIZE_IN_MB = (1 << 20); @@ -2721,6 +2947,140 @@ void qnn_instance::probe_device_meminfo() { GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); } +void qnn_instance::print_backend_info() { + auto print_property = [&](const char * name, QnnProperty_Key_t property) { + auto ret = _qnn_raw_interface.propertyHasCapability(property); + + const char * status = "Unknown"; + if (ret == QNN_PROPERTY_SUPPORTED) { + status = "Yes"; + } else if (ret == QNN_PROPERTY_NOT_SUPPORTED) { + status = "No"; + } + + GGMLQNN_LOG_INFO("%s: %s", name, status); + }; + + GGMLQNN_LOG_INFO("QNN backend properties:"); + print_property("Create context from binary list", QNN_PROPERTY_CONTEXT_SUPPORT_CREATE_FROM_BINARY_LIST_ASYNC); + print_property("Dynamic batch", QNN_PROPERTY_GRAPH_SUPPORT_BATCH_MULTIPLE); + print_property("Early termination", QNN_PROPERTY_GRAPH_SUPPORT_EARLY_TERMINATION); + print_property("Dynamic dimensions", QNN_PROPERTY_TENSOR_SUPPORT_DYNAMIC_DIMENSIONS); + print_property("Blockwise quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCK); + print_property("Blockwise quantization with expansion", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION); + print_property("Vector quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_VECTOR); + print_property("Tensor sparsity", QNN_PROPERTY_TENSOR_SUPPORT_SPARSITY); + print_property("Updateable application tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_APP_TENSORS); + print_property("Updateable native tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_NATIVE_TENSORS); + print_property("Updateable static tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_STATIC_TENSORS); + print_property("Qnn group device", QNN_PROPERTY_GROUP_DEVICE); +} + +void qnn_instance::htp_set_memory_grow_size(size_t size) { + QnnHtpPerfInfrastructure_MemoryConfig_t grow_size_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE, + .memGrowSizeConfig = (uint32_t)size, + }; + + const QnnHtpPerfInfrastructure_MemoryConfig_t *memory_config[] = { + &grow_size_config, + nullptr, + }; + Qnn_ErrorHandle_t ret = _qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); + if (ret != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to set HTP memory config"); + } else { + GGMLQNN_LOG_INFO("succeed to set HTP memory config"); + } +} + +void qnn_instance::htp_enter_performance_mode() { + QnnHtpPerfInfrastructure_PowerConfig_t dcvs_v3_config = { + 
.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, + .dcvsV3Config = + { + .contextId = _qnn_htp_powerconfig_id, + + .setDcvsEnable = 1, + .dcvsEnable = 0, + + .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE, + + .setSleepLatency = 1, + .sleepLatency = 40, + + .setSleepDisable = 1, + .sleepDisable = 1, + + .setBusParams = 1, + .busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + + .setCoreParams = 1, + .coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + }, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t hmx_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2, + .hmxV2Config = + { + .hmxPickDefault = 0, + .hmxVoltageCornerMin = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerTarget = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerMax = DCVS_EXP_VCORNER_MAX, + .hmxPerfMode = QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH, + }, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_ctrl_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY, + .rpcControlLatencyConfig = 100, + }; + + QnnHtpPerfInfrastructure_PowerConfig_t rpc_poll_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME, + .rpcPollingTimeConfig = 9999, + }; + + const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { + &dcvs_v3_config, + &hmx_config, + &rpc_ctrl_config, + &rpc_poll_config, + nullptr, + }; + Qnn_ErrorHandle_t ret = _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); + if (ret != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to set HTP power config"); + } else { + GGMLQNN_LOG_INFO("succeed to set HTP power config"); + } +} + +void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { + QnnHtpGraph_CustomConfig_t htp_hvx_thread_config = { + .option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS, + .numHvxThreads = n_threads, + }; + + QnnGraph_Config_t hvx_thread_config = { + .option = QNN_GRAPH_CONFIG_OPTION_CUSTOM, + .customConfig = &htp_hvx_thread_config, + }; + + const QnnGraph_Config_t * graph_configs[] = {&hvx_thread_config, nullptr}; + Qnn_ErrorHandle_t ret = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); + if (ret != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); + } else { + GGMLQNN_LOG_INFO("succeed to set QNN graph config: set hvx threads %d", n_threads); + } +} + static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { GGMLQNN_LOG_WARN("invalid params\n"); @@ -2741,7 +3101,7 @@ static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_t } static void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - //skip sanity check of params + //skip sanity check of params because of performance concern if (nullptr != func_name && nullptr != ctx) { GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); } @@ -2760,8 +3120,8 @@ static void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_ 
GGMLQNN_LOG_DEBUG("\n"); } -static void dump_op_info(const struct ggml_tensor * tensor) { - //skip sanity check of params +static void ggmlqnn_dump_op_info(const struct ggml_tensor * tensor) { + //skip sanity check of params because of performance concern const struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * dst = const_cast(tensor); @@ -2780,12 +3140,12 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, //ensure the tensor name is unique if (nullptr == name) { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", get_idx()); + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmqnn_get_tensoridx()); } else { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, get_idx()); + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmqnn_get_tensoridx()); } - GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx()); - inc_idx(); + GGMLQNN_LOG_DEBUG("init_tensor %d", ggmqnn_get_tensoridx()); + ggmqnn_inc_tensoridx(); uint32_t reverse_dims[GGML_MAX_DIMS] = {}; uint32_t transpose_dims[GGML_MAX_DIMS] = {}; @@ -2806,7 +3166,7 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, if (b_transpose) { GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case - get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); + ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); tensor_dims = transpose_dims; #if 0 for (size_t idx = 0; idx < 4; idx++) { @@ -2890,7 +3250,7 @@ static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn } // ================================================================================================= -// section-6: implementation of ggml-qnn backend +// section-8: implementation of ggml-qnn backend // ================================================================================================= //TODO: refine this function as it is a performance hotspot/bottleneck function static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor) { @@ -2935,7 +3295,7 @@ static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const s GGML_UNUSED(ne1); if (tensor->op == GGML_OP_ADD) { - //dump_op_info(tensor); + //ggmlqnn_dump_op_info(tensor); if (!ggml_are_same_shape(src0, src1)) { return false; } @@ -2945,7 +3305,7 @@ static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const s } if (tensor->op == GGML_OP_MUL_MAT) { - //dump_op_info(tensor); + //ggmlqnn_dump_op_info(tensor); if (src0_rank != src1_rank) // make QNN SDK happy return false; if (src0_rank < 2) // QNN's limitation, make QNN SDK happy @@ -2969,7 +3329,9 @@ static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const s } if (tensor->op == GGML_OP_MUL) { - //dump_op_info(tensor); + //ggmlqnn_dump_op_info(tensor); + if (ctx->device == QNN_BACKEND_NPU) + return false; if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix return false; return (src0->type == GGML_TYPE_F32) @@ -3296,19 +3658,25 @@ static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, s enum ggml_status result = GGML_STATUS_SUCCESS; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; GGML_UNUSED(ctx); - - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == 
GGML_OP_RESHAPE - || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW - || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; - } - bool ok = ggml_qnn_compute_forward(backend, node); - if (!ok) { - GGMLQNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", - __func__, node->name, ggml_op_name(node->op)); + //GGMLQNN_LOG_DEBUG("device %d", ctx->device); + //GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); + + if (0 == g_inference_approach) { + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE + || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW + || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + bool ok = ggml_qnn_compute_forward(backend, node); + if (!ok) { + GGMLQNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } } + } else { + //offload entire cgraph to QNN CPU & GPU & NPU + return ggmlqnn_graph_compute(backend, cgraph); } return result; @@ -3331,8 +3699,8 @@ static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t d return "unknown"; } if (0 == strncmp(ctx->name, "qnn-npu", 7)) { - const char * soc_info = qnn_get_socmodel_desc(ctx->socinfo.soc_model); - const char * htp_arch = qnn_get_htparch_desc(ctx->socinfo.htp_arch); + const char * soc_info = ggmlqnn_get_socmodel_desc(ctx->socinfo.soc_model); + const char * htp_arch = ggmlqnn_get_htparch_desc(ctx->socinfo.htp_arch); std::string dev_desc = std::string(ctx->desc) + std::string(soc_info) + "_" + std::string(htp_arch) + "," + std::string(ctx->socinfo.soc_desc); @@ -3353,12 +3721,12 @@ static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * } if (QNN_BACKEND_CPU == ctx->device || QNN_BACKEND_GGML == ctx->device) { - *total = get_system_total_memory_in_bytes(); - *free = get_system_free_memory_in_bytes(); + *total = ggmlqnn_get_system_total_memory_in_bytes(); + *free = ggmlqnn_get_system_free_memory_in_bytes(); } else if (QNN_BACKEND_GPU == ctx->device) { //TODO: probe GPU info in Qualcomm Adreno GPU - *total = get_system_total_memory_in_bytes(); - *free = get_system_free_memory_in_bytes(); + *total = ggmlqnn_get_system_total_memory_in_bytes(); + *free = ggmlqnn_get_system_free_memory_in_bytes(); } else if (QNN_BACKEND_NPU == ctx->device) { size_t rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); size_t rpc_ion_usage = ctx->instance->get_rpcmem_usage(); @@ -3370,8 +3738,15 @@ static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * } static enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { - GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_ACCEL; + struct ggml_backend_qnn_context * ctx = static_cast(dev->context); + if (QNN_BACKEND_CPU == ctx->device) + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + else if (QNN_BACKEND_GPU == ctx->device) + return GGML_BACKEND_DEVICE_TYPE_GPU; + else if (QNN_BACKEND_NPU == ctx->device) + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + else + return GGML_BACKEND_DEVICE_TYPE_CPU; } static void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, @@ -3605,6 +3980,8 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return g_qnn_mgr[device].backend; } + ggmlqnn_load_cfg(); + #if defined(__ANDROID__) std::string path = qnn_lib_path; if (QNN_BACKEND_NPU == device) { @@ -3671,7 +4048,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * 
qnn_lib_path) { GGML_BACKEND_DL_IMPL(ggml_backend_qnn_reg) // ================================================================================================= -// section-7: offload GGML op to QNN backend +// section-9: general approach: offload GGML op to QNN backend // ================================================================================================= static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { /* @@ -3702,13 +4079,6 @@ static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const return true; } -#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ - do { \ - if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ - } \ - } while (0) - /* * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input * tensor and 1 output tensor @@ -3750,10 +4120,10 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { p_tensor1 = tensor[1]; p_tensor2 = tensor[2]; } else { - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); + GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); GGML_ASSERT(instance->get_device_id() == ctx->device); //create QNN graph - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8); + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8, 4); if (QNN_SUCCESS != error) { GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; @@ -3844,10 +4214,7 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { /* * this function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend - * UT in ggml-qnn-ut.cpp passed: - * ./scripts/build-run-android.sh run_ut_mulmat 0 - * ./scripts/build-run-android.sh run_ut_mulmat 1 - * ./scripts/build-run-android.sh run_ut_mulmat 2 + * various UT has verified and succeed but failed in CT of test-backend-ops * * the logic of ggml_qnn_mul_mat_4d is similar to ggml_qnn_mul_mat but much more complicated * than ggml_qnn_mul_mat, so it's a standalone function. 
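Editor's note: to make the "general skeleton" described above easier to follow, here is a condensed sketch of the control flow that the single-node offload path takes (graph lookup by a shape-qualified key, lazy graph creation, then execution). It is an illustration only, not part of the patch: it reuses only helpers that this backend already defines (ggmlqnn_get_graphkey_from_op, init_qnn_graph, get_qnn_graph_handle, GQCGT, _qnn_graph_map), and the graphAddNode/graphExecute steps are left as comments because their exact invocation here is an assumption rather than the patch's literal code.

// hedged sketch only -- mirrors the flow of ggml_qnn_general_node, not its literal body
static void ggmlqnn_offload_single_node_sketch(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
    qnn_instance * instance = ctx->instance;

    std::string graph_key;
    ggmlqnn_get_graphkey_from_op(op, graph_key);   // op name + ggml type + input dims

    Qnn_GraphHandle_t graph_handle = nullptr;
    if (instance->_qnn_graph_map.find(graph_key) != instance->_qnn_graph_map.end()) {
        // cache hit: reuse the already-finalized QNN graph and its tensors
        graph_handle = std::get<0>(instance->_qnn_graph_map[graph_key]);
    } else {
        // cache miss: create the graph once, build 2 input tensors + 1 output tensor,
        // add a single op node, finalize, then store everything in _qnn_graph_map
        instance->init_qnn_graph(graph_key, static_cast<QNNBackend>(ctx->device), 8, 4);
        graph_handle = instance->get_qnn_graph_handle();
        // GQCGT(op->src[0], ...); GQCGT(op->src[1], ...); GQCGT(op, ...);
        // raw_interface.graphAddNode(...); raw_interface.graphFinalize(graph_handle, nullptr, nullptr);
    }

    // every invocation: bind the ggml buffers to the QNN tensors and run the graph
    // (assumed to end in raw_interface.graphExecute(graph_handle, inputs, 2, outputs, 1, nullptr, nullptr))
    GGML_UNUSED(graph_handle);
}

Caching by the dimensions-qualified key means the comparatively expensive graph build and finalize happen once per distinct op/shape combination, while subsequent invocations only rebind buffers and execute.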
@@ -4130,12 +4497,13 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { p_tensor2_transpose = tensors[4]; } else { //create QNN graph - GGMLQNN_LOG_DEBUG("graph name %s", graph_name.c_str()); - error = qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), nullptr, &graph_handle); + GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8, 4); if (QNN_SUCCESS != error) { GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } + graph_handle = instance->get_qnn_graph_handle(); //create computational tensor p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); @@ -4335,3 +4703,65 @@ void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } + +// ================================================================================================= +// section-10: second approach: mapping ggml computational cgraph to QNN graph +// ================================================================================================= +// details: https://github.com/ggml-org/llama.cpp/pull/12326#issuecomment-2712838649 +// ref: https://github.com/kantv-ai/kantv/blob/kantv-poc-with-qnn/core/ggml/jni/Inception_v3.cpp#L20634 +// TODO: mapping entire ggml cgraph to a single QNN graph +static enum ggml_status ggmlqnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + enum ggml_status ggml_result = GGML_STATUS_SUCCESS; + Qnn_ErrorHandle_t qnn_error = QNN_SUCCESS; + qnn_perf op_perf = qnn_perf("ggmlqnn_graph_compute"); + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + op_perf.start(); + + //now we got the entire ggml cgraph + GGMLQNN_LOG_DEBUG("qnn device %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device)); + GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); + int num_nodes = std::min(5, cgraph->n_nodes); + //for (int i = 0; i < cgraph->n_nodes; i++) { + for (int i = 0; i < num_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + + //now we'll offload the entire ggml cgraph to a single opcfg QNN graph + std::string graph_name; + ggmlqnn_get_graphkey_from_cgraph(cgraph, graph_name); + if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + //retrieve computational resource from cached QNN graph + qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + } else { + //create QNN graph + GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); + qnn_error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8, 4); + if (QNN_SUCCESS != qnn_error) { + GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d(%s)\n", graph_name.c_str(), qnn_error, + ggmlqnn_get_qnnerror_string(qnn_error)); + return ggml_result; + } + graph_handle = instance->get_qnn_graph_handle(); + //TODO: compose a single opcfg QNN graph + + //TODO: finalize QNN graph + //CHECK_QNN_API(qnn_error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + + //cache QNN graph + qnn_tensors_t 
ggml_op_mulmat_tensors; + ggml_op_mulmat_tensors.reserve(0); + auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + instance->_qnn_graph_map[graph_name] = graph_item; + } + //exec QNN graph + + GGMLQNN_LOG_DEBUG("the second inference approach \"mapping cgraph to QNN graph\" is actually not supported now"); + + return ggml_result; +} diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 5b5e55aa2f7b6..393f4d458f41b 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -14,7 +14,9 @@ GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf #https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk #https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk -QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.31.0.250130/ +QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/ +QNN_SDK_VERSION=2.32.0.250228 +QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION} #default is QNN NPU qnnbackend=2 @@ -32,11 +34,35 @@ function show_pwd() } -function check_qnn_sdk() +function check_and_download_qnn_sdk() { + is_qnn_sdk_exist=1 + if [ ! -d ${QNN_SDK_PATH} ]; then - echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, pls check or download it from ${QNN_SDK_URL}...\n" - exit 1 + echo -e "QNN_SDK_PATH ${QNN_SDK_PATH} not exist, download it from ${QNN_SDK_URL}...\n" + is_qnn_sdk_exist=0 + fi + + if [ ! -f ${QNN_SDK_PATH}/sdk.yaml ]; then + is_qnn_sdk_exist=0 + fi + + if [ ${is_qnn_sdk_exist} -eq 0 ]; then + echo "sudo mkdir -p ${QNN_SDK_INSTALL_PATH}" + sudo mkdir -p ${QNN_SDK_INSTALL_PATH} + if [ ! -f v${QNN_SDK_VERSION}.zip ]; then + wget --no-config --quiet --show-progress -O v${QNN_SDK_VERSION}.zip https://softwarecenter.qualcomm.com/api/download/software/sdks/Qualcomm_AI_Runtime_Community/All/${QNN_SDK_VERSION}/v${QNN_SDK_VERSION}.zip + fi + unzip v${QNN_SDK_VERSION}.zip + if [ $? -ne 0 ]; then + printf "failed to download Qualcomm QNN SDK to %s \n" "${QNN_SDK_PATH}" + exit 1 + fi + sudo mv qairt/${QNN_SDK_VERSION} ${QNN_SDK_INSTALL_PATH}/ + printf "Qualcomm QNN SDK saved to ${QNN_SDK_PATH} \n\n" + sudo rm -rf qairt + else + printf "Qualcomm QNN SDK already exist:${QNN_SDK_PATH} \n\n" fi } @@ -75,7 +101,7 @@ function check_and_download_ndk() function build_arm64 { - cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_USE_QNN=ON -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} + cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DGGML_QNN_SDK_PATH=${QNN_SDK_PATH} cd out/android make -j16 show_pwd @@ -97,11 +123,14 @@ function check_qnn_libs() { #reuse the cached qnn libs on Android phone adb shell ls ${REMOTE_PATH}/libQnnCpu.so + adb shell ls ${REMOTE_PATH}/libQnnGpu.so + adb shell ls ${REMOTE_PATH}/libQnnHtp.so if [ $? 
-eq 0 ]; then printf "QNN libs already exist on Android phone\n" else update_qnn_libs fi + update_qnn_cfg } @@ -119,11 +148,17 @@ function update_qnn_libs() } +function update_qnn_cfg() +{ + adb push ./scripts/ggml-qnn.cfg ${REMOTE_PATH}/ +} + + function build_ggml_qnn() { show_pwd check_and_download_ndk - check_qnn_sdk + check_and_download_qnn_sdk dump_vars remove_temp_dir build_arm64 @@ -140,21 +175,20 @@ function prepare_run_on_phone() check_qnn_libs - if [ -f ./out/android/bin/libggml-qnn.so ]; then + if [ -f ./out/android/bin/libggml-cpu.so ]; then adb push ./out/android/bin/*.so ${REMOTE_PATH}/ fi adb push ./out/android/bin/${program} ${REMOTE_PATH}/ adb shell chmod +x ${REMOTE_PATH}/${program} } - function run_llamacli() { prepare_run_on_phone llama-cli adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -ngl 99 -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" } @@ -213,7 +247,6 @@ function run_test-op() } - function print_oplist() { oplist="DUP @@ -302,7 +335,7 @@ function show_usage() echo " $0 build" echo " $0 updateqnnlib" echo " $0 run_testops" - echo " $0 run_testop [ADD/MUL/MUL_MAT/...(op from print_oplist)] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]" + echo " $0 run_testop [ADD/MUL/MUL_MAT......(op from print_oplist)] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]" echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" @@ -312,7 +345,8 @@ function show_usage() show_pwd -check_qnn_sdk +check_and_download_ndk +check_and_download_qnn_sdk if [ $# == 0 ]; then show_usage @@ -343,20 +377,22 @@ elif [ $# == 1 ]; then fi elif [ $# == 2 ]; then qnnbackend=$2 + if [ ${qnnbackend} -gt 3 ]; then + show_usage + exit 1 + fi + if [ "$1" == "run_llamacli" ]; then run_llamacli exit 0 elif [ "$1" == "run_llamabench" ]; then run_llamabench exit 0 - exit 0 - else - show_usage - exit 1 fi elif [ $# == 3 ]; then - #opname can be found via print_oplist: opname=$2 +#TODO: check opname in oplist +#opname can be found via print_oplist: qnnbackend=$3 if [ ${qnnbackend} -gt 3 ]; then diff --git a/scripts/ggml-qnn.cfg b/scripts/ggml-qnn.cfg new file mode 100644 index 0000000000000..5796e613ff2af --- /dev/null +++ b/scripts/ggml-qnn.cfg @@ -0,0 +1,9 @@ +[general] +# enable/disable QNN's internal log +print_qnn_internal_log = 0 +# 0: general approach,similar to ggml-sycl or ggml-cann +# 1: mapping entire ggml cgraph to QNN graph +inference_approach = 0 + +[npu] +npu_inference_datatype = "fp16" diff --git a/tests/ggml-qnn-ut.cpp b/tests/ggml-qnn-ut.cpp deleted file mode 100644 index 75d941263b82c..0000000000000 --- a/tests/ggml-qnn-ut.cpp +++ /dev/null @@ -1,480 +0,0 @@ -/* - * Copyright (c) 2023-2024 The ggml authors - * - * implementation of self-made Android command line tool for verify ggml-qnn backend - * this file will help you to understand fundamental principle of ggml and ggml-qnn backend - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit 
persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ -#include -#include -#include -#include -#include -#include -#include -#if defined(__ANDROID__) || defined(__linux__) -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ggml.h" -#include "ggml-cpu.h" -#include "ggml-alloc.h" -#include "ggml-backend.h" -#include "ggml-qnn.h" - -static void tensor_dump(const ggml_tensor * tensor, const char * name); - -#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor) - -static bool ggml_graph_compute_helper( - struct ggml_backend * backend, - struct ggml_cgraph * graph, - std::vector & buf, - int n_threads, - ggml_abort_callback abort_callback, - void * abort_callback_data) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, NULL); - - plan.abort_callback = abort_callback; - plan.abort_callback_data = abort_callback_data; - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - if (nullptr != backend) - return ggml_backend_graph_compute(backend, graph) == GGML_STATUS_SUCCESS; - else - return ggml_graph_compute(graph, &plan); -} - - -static void tensor_dump_elements(const ggml_tensor * tensor) { - float value = 0; - std::ostringstream tmposs; - if (tensor->type == GGML_TYPE_F32) { - for (int h = 0; h < tensor->ne[3]; h++) { - for (int i = 0; i < tensor->ne[2]; i++) { - for (int j = 0; j < tensor->ne[1]; j++) { - for (int k = 0; k < tensor->ne[0]; k++) { - value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + - j * tensor->ne[0] + k]; - tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value - << " "; - } - if (strlen(tmposs.str().c_str()) <= (4096 - 96)) { - printf("%s\n", tmposs.str().c_str()); - } - tmposs.clear(); - tmposs.str(""); - } - } - } - } - - printf("\n"); -} - - -static void tensor_dump(const ggml_tensor * tensor, const char * name) { - printf("dump ggml tensor %s(%s)\n", name, tensor->name); - printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", - name, - tensor->type, ggml_type_name(tensor->type), - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], - tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); - tensor_dump_elements(tensor); - - printf("\n"); -} - - -static uint32_t get_tensor_rank(const ggml_tensor * tensor) { - uint32_t rank = 0; - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if ((0 != tensor->ne[i]) && (1 != tensor->ne[i])) { - rank++; - } - } - return rank; -} - - -static uint32_t get_tensor_data_size(const ggml_tensor * tensor) { - 
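// note: this deleted helper simply defers to ggml_nbytes(), which already
// accounts for the block size of quantized types, so no manual
// ne[0] * ne[1] * ... * type_size arithmetic is needed in the test tool.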
return ggml_nbytes(tensor); -} - - -//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L20 -static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float max = 1.0f) { - // static RNG initialization (revisit if n_threads stops being constant) - static const size_t n_threads = std::thread::hardware_concurrency(); - static std::vector generators = []() { - std::random_device rd; - std::vector vec; - vec.reserve(n_threads); - //for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(1234 + i); } // fixed seed - for (size_t i = 0; i < n_threads; i++) { vec.emplace_back(rd()); } - return vec; - }(); - - size_t size = ggml_nelements(tensor); - std::vector data(size); - - auto init_thread = [&](size_t ith, size_t start, size_t end) { - std::uniform_real_distribution distribution(min, max); - for (size_t i = start; i < end; i++) { - data[i] = distribution(generators[ith]); - } - }; - - std::vector threads; - threads.reserve(n_threads); - for (size_t i = 0; i < n_threads; i++) { - size_t start = i*size/n_threads; - size_t end = (i+1)*size/n_threads; - threads.emplace_back(init_thread, i, start, end); - } - for (auto & t : threads) { - t.join(); - } - if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) { - ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float)); - } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) { - GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0); - std::vector dataq(ggml_row_size(tensor->type, size)); - std::vector imatrix(tensor->ne[0], 1.0f); // dummy importance matrix - const float * im = imatrix.data(); - if (!ggml_quantize_requires_imatrix(tensor->type)) { - // when the imatrix is optional, we want to test both quantization with and without imatrix - // use one of the random numbers to decide - if (data[0] > 0.5f*(min + max)) { - im = nullptr; - } - } - ggml_quantize_chunk(tensor->type, data.data(), dataq.data(), 0, size/tensor->ne[0], tensor->ne[0], im); - GGML_ASSERT(ggml_validate_row_data(tensor->type, dataq.data(), dataq.size())); - ggml_backend_tensor_set(tensor, dataq.data(), 0, dataq.size()); - } else if (tensor->type == GGML_TYPE_I8 || tensor->type == GGML_TYPE_I16 || tensor->type == GGML_TYPE_I32) { - // This is going to create some weird integers though. 
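// the raw float buffer below is copied byte-for-byte into the integer tensor,
// so the stored values are reinterpreted IEEE-754 bit patterns rather than
// rounded integers -- enough to exercise the backend, not numerically meaningful.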
- ggml_backend_tensor_set(tensor, data.data(), 0, ggml_nbytes(tensor)); - } else { - GGML_ASSERT(false); - } -} - - -//ref: https://github.com/ggerganov/llama.cpp/blob/master/tests/test-backend-ops.cpp#L310 -static void initialize_tensors(ggml_context * ctx) { - for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - init_tensor_uniform(t); - } -} - - -static void show_usage() { - printf(" " \ - "\nUsage: ggml-qnn-ut [options]\n" \ - "\n" \ - "Options:\n" \ - " -t GGML_OP_ADD / GGML_OP_MUL / GGML_OP_MULMAT\n" \ - " -b 0(QNN_CPU) 1(QNN_GPU) 2(QNN_NPU) 3(QNN_GGML)\n" \ - " ?/h print usage information\n\n" - ); -} - - -int main(int argc, char * argv[]) { - int64_t n_begin_time = 0LL; - int64_t n_end_time = 0LL; - int64_t n_duration = 0LL; - size_t ctx_size = 0; - int sizey = 4; - int sizex = 4; - int num_threads = 4; - int n_backend_type = QNN_BACKEND_CPU; - int n_ggml_op_type = GGML_OP_ADD; - - struct ggml_context * ctx = nullptr; - struct ggml_cgraph * gf = nullptr; - struct ggml_tensor * src0 = nullptr; - struct ggml_tensor * src1 = nullptr; - struct ggml_tensor * dst = nullptr; - ggml_backend_t backend = nullptr; - ggml_backend_buffer_t buffer= nullptr; - ggml_type qtype = GGML_TYPE_F32; - //ggml_type qtype = GGML_TYPE_Q4_0; - std::vector work_buffer; - - for (int i = 1; i < argc; i++) { - if (0 == strcmp(argv[i], "-t")) { - if (i + 1 < argc) { - if (0 == memcmp(argv[i + 1], "GGML_OP_ADD", 11)) { - n_ggml_op_type = GGML_OP_ADD; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL_MAT", 15)) { - n_ggml_op_type = GGML_OP_MUL_MAT; - } else if (0 == memcmp(argv[i + 1], "GGML_OP_MUL", 11)) { - n_ggml_op_type = GGML_OP_MUL; - } else { - show_usage(); - return 1; - } - i++; - } - } else if (0 == strcmp(argv[i], "-b")) { - if (i + 1 < argc) { - int backend = atoi(argv[i + 1]); - if (backend <= QNN_BACKEND_GGML) - n_backend_type = backend; - else { - show_usage(); - return 1; - } - i++; - } - } else { - show_usage(); - return 1; - } - } - - printf("Testing %zu devices\n\n", ggml_backend_dev_count()); - for (size_t i = 0; i < ggml_backend_dev_count(); i++) { - ggml_backend_dev_t dev = ggml_backend_dev_get(i); - - printf("Backend %zu/%zu: %s\n", i + 1, ggml_backend_dev_count(), - ggml_backend_dev_name(dev)); - - if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) { - printf(" Skipping CPU backend\n"); - continue; - } - - backend = ggml_backend_dev_init(dev, reinterpret_cast(i)); - GGML_ASSERT(backend != NULL); - if (backend != nullptr) { - printf("%s: initialize %s backend\n", __func__, ggml_backend_dev_name(dev)); - } - - printf(" Device description: %s\n", ggml_backend_dev_description(dev)); - size_t free, total; - ggml_backend_dev_memory(dev, &free, &total); - printf(" Device memory: %zu MB (%zu MB free)\n", total / 1024 / 1024, free / 1024 / 1024); - printf("\n"); - } - - ggml_backend_t backend_cpu = nullptr; - backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - if (nullptr == backend_cpu) { - printf("failed to initialize cpu backend\n"); - exit(1); - } else { - printf("succeed to initialize cpu backend\n"); - } - - printf("ggml op:%d(%s)", n_ggml_op_type, ggml_op_name((enum ggml_op) n_ggml_op_type)); - - n_begin_time = ggml_time_us(); - srand(time(NULL)); - - ctx_size += 1024 * 1024 * 32; - printf("Allocating Memory of size %zi bytes, %zi MB\n", ctx_size, - (ctx_size / 1024 / 1024)); - - struct ggml_init_params params = { - /*.mem_size =*/ ctx_size, - /*.mem_buffer =*/ NULL, - /* no_alloc =*/ 0 - }; - - if 
(n_backend_type != QNN_BACKEND_GGML) { - params.no_alloc = true; - } - - ctx = ggml_init(params); - if (!ctx) { - printf("ggml_init() failed\n"); - return 2; - } - - printf("creating new tensors\n"); - printf("ggml_blck_size(%s) %ld\n", ggml_type_name(qtype), ggml_blck_size(qtype)); - printf("ggml_type_size(%s) %ld\n", ggml_type_name(qtype), ggml_type_size(qtype)); - if (qtype != GGML_TYPE_F32) { - sizex = ggml_blck_size(qtype); - } - - if (n_ggml_op_type == GGML_OP_ADD) { - src0 = ggml_new_tensor_2d(ctx, qtype, sizey, sizex); - src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizey, sizex); - } else { - //verify 2D matrix - //src0 = ggml_new_tensor_2d(ctx, qtype, 128, 64); - //src1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 128, 2); - //verify 3D matrix - //src0 = ggml_new_tensor_3d(ctx, qtype, 128, 64, 8); - //src1 = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 2, 8); - //verify 4D matrix -#if 1 //ok - src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); - src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 1, 6, 4); -#else //ok - src0 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); - src1 = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, 256, 16, 3, 2); -#endif - } - - ggml_set_input(src0); - ggml_set_input(src1); - switch (n_ggml_op_type) { - case GGML_OP_ADD: - dst = ggml_add(ctx, src0, src1); - break; - case GGML_OP_MUL: - dst = ggml_mul(ctx, src0, src1); - break; - case GGML_OP_MUL_MAT: - dst = ggml_mul_mat(ctx, src0, src1); - break; - default: - printf("ggml op %d(%s) not supported", n_ggml_op_type, - ggml_op_name((enum ggml_op) n_ggml_op_type)); - ggml_free(ctx); - ggml_backend_free(backend); - return 3; - } - - ggml_set_output(dst); - -#ifdef GGML_USE_QNN - if (n_backend_type != QNN_BACKEND_GGML) { - printf("init QNN backend %d\n", n_backend_type); - //re-init again - backend = ggml_backend_qnn_init(n_backend_type, "/data/local/tmp/"); - if (nullptr == backend) { - printf("create qnn backend %d(%s) failed\n", n_backend_type, ggml_backend_qnn_get_devname(n_backend_type)); - return 1; - } else { - printf("create qnn backend %d(%s) succeed\n", n_backend_type, ggml_backend_qnn_get_devname(n_backend_type)); - } - - //buffer = ggml_backend_alloc_ctx_tensors(ctx, backend); - ggml_backend_buffer_type_t buft = ggml_backend_get_default_buffer_type(backend); - buffer = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); - if (!buffer) { - printf("%s: failed to allocate backend buffer\n", __func__); - ggml_free(ctx); - ggml_backend_free(backend); - return 4; - } - } else { - printf("init default cpu backend\n"); - backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr); - } -#endif - - printf("creating compute graph\n"); - gf = ggml_new_graph(ctx); - ggml_build_forward_expand(gf, dst); - - if (qtype == GGML_TYPE_F32) { - if (n_backend_type != QNN_BACKEND_GGML) { - initialize_tensors(ctx); - } else { - ggml_set_f32(src0, (rand() % 100 + 1)); - ggml_set_f32(src1, (rand() % 100 + 1)); - ggml_set_f32(dst, 0.0f); - } - //for compare compute result between cpu backend and QNN backend - ggml_set_f32(src0, 1.0f); - ggml_set_f32(src1, 2.0f); - ggml_set_f32(dst, 0.0f); - } else { - initialize_tensors(ctx); - } - - ggml_graph_compute_helper(backend, gf, work_buffer, num_threads, nullptr, nullptr); - if (get_tensor_data_size(dst) < (100 * 100)) { - printf("dump result tensors:\n"); - TENSOR_DUMP(src0); - TENSOR_DUMP(src1); - TENSOR_DUMP(dst); - } else { - printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src0->name, - 
src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], - src0->nb[0], src0->nb[1], src0->nb[2]); - printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], - src1->nb[0], src1->nb[1], src1->nb[2]); - printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0], - dst->nb[1], dst->nb[2]); - } - //TENSOR_DUMP(dst); - - ggml_free(ctx); - ggml_backend_buffer_free(buffer); - ggml_backend_free(backend); - - n_end_time = ggml_time_us(); - n_duration = (n_end_time - n_begin_time) / 1000; -#ifdef GGML_USE_QNN - printf("duration of ut GGML_OP_%s using QNN backend %s: %ld milliseconds\n", ggml_op_name((enum ggml_op)n_ggml_op_type), ggml_backend_qnn_get_devname(n_backend_type), n_duration); -#endif - - return 0; -} From c3fd461f70346d9c8a4813324ff7e10f7dfc1a57 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 18 Mar 2025 22:17:00 +0800 Subject: [PATCH 125/200] ggml-qnn: rebase to upstream --- CMakeLists.txt | 11 + ggml/src/ggml-qnn/ggml-qnn.cpp | 2311 ++++++++++++++++++-------------- scripts/build-run-android.sh | 76 +- scripts/ggml-qnn.cfg | 21 +- 4 files changed, 1355 insertions(+), 1064 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index de51c0a17b2f6..2148b436d2afc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,16 @@ set(CMAKE_WARN_UNUSED_CLI YES) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + set(TARGET_SNAPDRAGON8GEN3 ON) + if(TARGET_SNAPDRAGON8GEN3) + #works fine on Snapdragon 8Gen3 with 1.5x(45+ tokens/second)-3x(70+ tokens/second) performance gain through the default ggml backend + add_definitions(-march=armv8.7-a) + add_definitions(-mcpu=cortex-x1) + add_definitions(-mtune=cortex-x1) + endif() +endif() + if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") @@ -119,6 +129,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_CANN GGML_CANN) +llama_option_depr(WARNING LLAMA_QNN GGML_QNN) if (NOT MSVC) if (LLAMA_SANITIZE_THREAD) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 7c3477094ea9f..834af1e08e30f 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -14,13 +14,16 @@ * section-6 QNN helper function * section-7 ggml-qnn backend helper function / class * section-8 implementation of ggml-qnn backend according to ggml's backend subsystem - * section-9 implementation of offload ggml op to QNN backend - * section-10 illustrate why the second approach is actual an fake at the moment + * section-9 implementation of general approach or the first tech approach + * section-10 implementation of the second tech approach:mapping the entire ggml cgraph to a single QNN graph * * currently provide following ggml op' QNN backend implementation: - * - GGML_OP_ADD: this is a simple skeleton, can expand other ggml ops according to expertise - * - GGML_OP_MUL: this is a simple skeleton, can expand other ggml ops according to expertise - * - GGML_OP_MUL_MAT:this is a complicated skeleton, 
can expand other complex ggml ops accordingly + * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV: + * this is a simple skeleton, can expand other ggml ops according to expertise + * - GGML_OP_LOG/GGML_OP_SQRT: + * this is a simple skeleton, can expand other ggml ops according to expertise + * - GGML_OP_MUL_MAT: + * this is a complicated skeleton, can expand other complex ggml ops accordingly * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -108,90 +111,70 @@ #include "ggml-backend-impl.h" // ================================================================================================= -// section-1: forward/prototype declaration +// section-1: forward/prototype declaration, macro // ================================================================================================= class qnn_instance; +struct qnn_parameter; struct ggml_backend_qnn_context; -typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); -static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, - Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, - uint32_t rank, uint32_t * dims, - void * data, uint32_t data_size, - bool b_transpose = false); -static enum ggml_status ggmlqnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph); -static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); -static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); - -//op functions: -//done -static void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -//todo -static void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void 
ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); -// ================================================================================================= -// section-2: global var, macro, data structure -// ================================================================================================= -// the following two vars can be fetched from [qnn_runtimelib_path]/ggml-qnn.cfg -// [general] -// print_qnn_internal_log=0 -// inference_approach=0 -static int g_print_qnn_internal_log = 0; // enable/disable QNN's internal log -static int g_inference_approach = 0; // 0: general approach,similar to ggml-sycl or ggml-cann 1: mapping entire ggml cgraph to QNN graph -static const char * g_qnn_cfgfilename = "ggml-qnn.cfg"; +typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); -#if defined(__ANDROID__) -//Android command line program -static const char * g_qnn_runtimelib_path = "/data/local/tmp/"; -#elif defined(__linux__) -static const char * g_qnn_runtimelib_path = "/tmp/"; -#elif defined(_WIN32) -static const char * g_qnn_runtimelib_path = "C:\\"; -#endif - -#if !defined(__ANDROID__) && !defined(__linux__) -static std::atomic g_ggmltensor_idx(0); //ensure every QNN tensor name is unique -#else -static int32_t g_ggmltensor_idx = 0; //ensure every QNN tensor name is unique -#endif +//general function prototypes for ggml-qnn backend +static void ggmlqnn_dump_tensor(const ggml_tensor * tensor, const char * name); +static enum ggml_status ggmlqnn_backend_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph); +static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); +static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); +static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false); + +//function prototypes for all op functions in the first tech approach(general approach in other backends) +//general op function for elment-wise operation on 1/2 input tensors and 1 output tensor +static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); + +//todo by AI experts +static void ggmlqnn_compute_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void 
ggmlqnn_compute_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); + +//function prototypes for all op functions in the second tech approach("mapping the entire cgraph to a single QNN graph") +static void ggmlqnn_graph_addnode(ggml_backend_qnn_context * ctx, struct ggml_cgraph * cgraph, + Qnn_GraphHandle_t graph_handle, std::string & graph_name, ggml_tensor * op, bool is_reuse_graph = false); #if 0//def NDEBUG #define GGMLQNN_DEBUG 0 -#define ENABLE_QNNBACKEND_PERF 0 -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 0 #else -#define GGMLQNN_DEBUG 1 // for troubleshooting QNN backend -#define ENABLE_QNNBACKEND_PERF 0 -#define GGMLQNN_PRINT_OP_ADD_LOG 0 // GGML_OP_ADD already verified with QNN-CPU / QNN-GPU / QNN-NPU -#define GGMLQNN_PRINT_OP_MUL_MAT_LOG 1 +#define GGMLQNN_DEBUG 1 #endif + #define GGML_QNN_LOGBUF_LEN 4096 +#define GGML_QNN_TMPBUF_LEN 256 #define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -202,10 +185,10 @@ static int32_t g_ggmltensor_idx = 0; //ensure every QNN tensor name is unique #else #define GGMLQNN_LOG_DEBUG(...) 
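// with GGMLQNN_DEBUG disabled the debug macro expands to nothing, so per-op
// debug logging compiles away entirely; the other levels always route through
// ggmlqnn_log_internal(), which records __FILE__/__FUNCTION__/__LINE__.
// illustrative usage (the message text here is made up):
//   GGMLQNN_LOG_DEBUG("offload %s to %s", ggml_op_name(op->op), ctx->name);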
#endif +#define GGMLQNN_DUMP_TENSOR(tensor) ggmlqnn_dump_tensor(tensor, #tensor) #define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) #define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) -#define GQCGT ggmlqnn_create_general_tensor #define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 @@ -261,6 +244,9 @@ static int32_t g_ggmltensor_idx = 0; //ensure every QNN tensor name is unique } \ } while (0) +// ================================================================================================= +// section-2: data type, data structure, global vars +// ================================================================================================= using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); @@ -269,10 +255,20 @@ using pfn_rpc_mem_to_fd = int (*)(void *); using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); -using qnn_res_t = std::tuple>; -using qnn_tensors_t = std::vector< Qnn_Tensor_t *>; -enum class ggml_qnn_profile_level { +//QNN resource management for the first technical approach(general approach in ggml-sycl or ggml-cann) +using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>; +using qnn_singlenode_res_t = std::tuple; + +//QNN resource management for the second technical approach(mapping the entire cgraph to a single QNN graph) +using qnn_tensors_t = std::vector< Qnn_Tensor_t >; +using qnn_tensor_pair_t = std::tuple< ggml_tensor *, Qnn_Tensor_t *>; +using qnn_tensor_pairs_t = std::vector< qnn_tensor_pair_t >; +using qnn_cgraph_node_t = std::tuple; +using qnn_cgraph_nodes_t = std::vector; +using qnn_multinode_res_t = std::tuple; + +enum class qnn_profile_level { profile_off = 0, profile_basic = 1, profile_detail = 2 @@ -322,17 +318,71 @@ struct ggml_backend_qnn_context { QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; struct qcom_socinfo socinfo; + //QNN resource management for the first technical approach(general approach in ggml-sycl or ggml-cann) + std::map qnn_singlenode_graph_map; + //QNN resource management for the second technical approach(mapping the entire cgraph to a single QNN graph) + std::map qnn_multinode_graph_map; + std::unique_ptr work_data; std::vector> tasks; - size_t work_size = 0; - size_t desired_size = 0; - int n_threads = GGML_DEFAULT_N_THREADS; + size_t work_size; + size_t desired_size; + int n_threads; +}; + +struct qnn_op_caps { + bool supported; + ggml_op op; + const char * qnn_op_name; + const size_t input_param_count; + const char * qnn_param_name; +}; + +struct qnn_parameter { + int print_qnn_internal_log; // enable/disable QNN's internal log + int enable_perf; // enable/disable perf of op function + int print_tensors_info; // enable/disable print tensors info in op function + int dump_op_info; // enable/disable dump op info in handle_op + int precision_mode; // 0: default 1:fp16 + int hvx_threads; + int vtcm_size_in_mb; + int enable_dlbc; + int inference_approach; // 0: general approach,similar to ggml-sycl or ggml-cann 1: mapping entire ggml cgraph to QNN graph + int qnn_backend; // 0: QNN-CPU backend, 1: QNN-GPU backend, 2: QNN-NPU backend + const char * qnn_cfgfilename; + const char * qnn_runtimelib_path; }; -struct qnn_op_caps_t { - const char * qnn_op_name = nullptr; - const size_t input_param_count = 0; - const 
char * qnn_param_name = nullptr; +//TODO:I don't think threadsafe is required at the moment +// so we can uniform them to avoid compiler/toolchain's complains +#if !defined(__ANDROID__) && !defined(__linux__) +static std::atomic g_qnntensor_idx(0); //ensure every QNN tensor name is unique +static std::atomic g_qnnopcfg_idx(0); //ensure every QNN opconfig name is unique +#else +static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique +static int32_t g_qnnopcfg_idx = 0; //ensure every QNN opconfig name is unique +#endif + +static struct qnn_parameter g_qnn_params = { + .print_qnn_internal_log = 0, + .enable_perf = 0, + .print_tensors_info = 0, + .dump_op_info = 0, + .precision_mode = 0, + .hvx_threads = 4, + .vtcm_size_in_mb = 8, + .enable_dlbc = 1, + .inference_approach = 0, + .qnn_backend = 2, //default is QNN-NPU backend + .qnn_cfgfilename = "ggml-qnn.cfg", +#if defined(__ANDROID__) +//Android command line program + .qnn_runtimelib_path = "/data/local/tmp/", +#elif defined(__linux__) + .qnn_runtimelib_path = "/tmp/", +#elif defined(_WIN32) + .qnn_runtimelib_path = "C:\\", +#endif }; //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices @@ -464,118 +514,115 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, }; -static const qnn_op_caps_t ggmlqnn_k_op_caps[] = { - {}, // GGML_OP_NONE - {}, // GGML_OP_DUP - { - // GGML_OP_ADD - QNN_OP_ELEMENT_WISE_ADD, - 2, - }, - {}, // GGML_OP_ADD1 - {}, // GGML_OP_ACC - {}, // GGML_OP_SUB - { - // GGML_OP_MUL - QNN_OP_ELEMENT_WISE_MULTIPLY, - 2, - }, - {}, // GGML_OP_DIV - {}, // GGML_OP_SQR - {}, // GGML_OP_SQRT - {}, // GGML_OP_LOG - {}, // GGML_OP_SIN - {}, // GGML_OP_COS - {}, // GGML_OP_SUM - {}, // GGML_OP_SUM_ROWS - {}, // GGML_OP_MEAN - {}, // GGML_OP_ARGMAX - {}, // GGML_OP_COUNT_EQUAL - {}, // GGML_OP_REPEAT - {}, // GGML_OP_REPEAT_BACK - {}, // GGML_OP_CONCAT - {}, // GGML_OP_SILU_BACK - {}, // GGML_OP_NORM - {}, // GGML_OP_RMS_NORM - {}, // GGML_OP_RMS_NORM_BACK - {}, // GGML_OP_GROUP_NORM - { - // GGML_OP_MUL_MAT - QNN_OP_MAT_MUL, - 2, - }, - {}, // GGML_OP_MUL_MAT_ID - {}, // GGML_OP_OUT_PROD - {}, // GGML_OP_SCALE - {}, // GGML_OP_SET - {}, // GGML_OP_CPY - {}, // GGML_OP_CONT - {}, // GGML_OP_RESHAPE - {}, // GGML_OP_VIEW - {}, // GGML_OP_PERMUTE - {}, // GGML_OP_TRANSPOSE - {}, // GGML_OP_GET_ROWS - {}, // GGML_OP_GET_ROWS_BACK - {}, // GGML_OP_DIAG - {}, // GGML_OP_DIAG_MASK_INF - {}, // GGML_OP_DIAG_MASK_ZERO - {}, // GGML_OP_SOFT_MAX - {}, // GGML_OP_SOFT_MAX_BACK - {}, // GGML_OP_ROPE - {}, // GGML_OP_ROPE_BACK - {}, // GGML_OP_CLAMP - {}, // GGML_OP_CONV_TRANSPOSE_1D - {}, // GGML_OP_IM2COL - {}, // GGML_OP_IM2COL_BACK - {}, // GGML_OP_CONV_TRANSPOSE_2D - {}, // GGML_OP_POOL_1D - {}, // GGML_OP_POOL_2D - {}, // GGML_OP_POOL_2D_BACK - {}, // GGML_OP_UPSCALE - {}, // GGML_OP_PAD - {}, // GGML_OP_PAD_REFLECT_1D - {}, // GGML_OP_ARANGE - {}, // GGML_OP_TIMESTEP_EMBEDDING - {}, // GGML_OP_ARGSORT - {}, // GGML_OP_LEAKY_RELU - {}, // GGML_OP_FLASH_ATTN_EXT - {}, // GGML_OP_FLASH_ATTN_BACK - {}, // GGML_OP_SSM_CONV - {}, // GGML_OP_SSM_SCAN - {}, // GGML_OP_WIN_PART - {}, // GGML_OP_WIN_UNPART - {}, // GGML_OP_GET_REL_POS - {}, // GGML_OP_ADD_REL_POS - {}, // GGML_OP_RWKV_WKV6 - {}, // GGML_OP_GATED_LINEAR_ATTN - {}, // GGML_OP_UNARY - {}, // GGML_OP_MAP_UNARY - {}, // GGML_OP_MAP_BINARY - {}, // GGML_OP_MAP_CUSTOM1_F32 - {}, // GGML_OP_MAP_CUSTOM2_F32 - {}, // GGML_OP_MAP_CUSTOM3_F32 - {}, // GGML_OP_MAP_CUSTOM1 - {}, // 
GGML_OP_MAP_CUSTOM2 - {}, // GGML_OP_MAP_CUSTOM3 - {}, // GGML_OP_CROSS_ENTROPY_LOSS - {}, // GGML_OP_CROSS_ENTROPY_LOSS_BACK - {}, // GGML_OP_OPT_STEP_ADAMW - {}, // GGML_UNARY_OP_ABS - {}, // GGML_UNARY_OP_SGN - {}, // GGML_UNARY_OP_NEG - {}, // GGML_UNARY_OP_STEP - {}, // GGML_UNARY_OP_TANH - {}, // GGML_UNARY_OP_ELU - {}, // GGML_UNARY_OP_RELU - {}, // GGML_UNARY_OP_SIGMOID - {}, // GGML_UNARY_OP_GELU - {}, // GGML_UNARY_OP_GELU_QUICK - {}, // GGML_UNARY_OP_SILU - {}, // GGML_UNARY_OP_HARDSWISH - {}, // GGML_UNARY_OP_HARDSIGMOID - {}, // GGML_UNARY_OP_EXP +static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { + {true, GGML_OP_NONE, nullptr, 0, nullptr}, + {false, GGML_OP_DUP}, + {true, GGML_OP_ADD, QNN_OP_ELEMENT_WISE_ADD, 2}, + {false, GGML_OP_ADD1}, + {false, GGML_OP_ACC}, + {true, GGML_OP_SUB, QNN_OP_ELEMENT_WISE_SUBTRACT, 2}, + {true, GGML_OP_MUL, QNN_OP_ELEMENT_WISE_MULTIPLY, 2}, + {true, GGML_OP_DIV, QNN_OP_ELEMENT_WISE_DIVIDE, 2}, + {false, GGML_OP_SQR}, + {true, GGML_OP_SQRT, QNN_OP_ELEMENT_WISE_SQUARE_ROOT, 1}, + {true, GGML_OP_LOG, QNN_OP_ELEMENT_WISE_LOG, 1}, + {false, GGML_OP_SIN}, + {false, GGML_OP_COS}, + {false, GGML_OP_SUM}, + {false, GGML_OP_SUM_ROWS}, + {false, GGML_OP_MEAN}, + {false, GGML_OP_ARGMAX}, + {false, GGML_OP_COUNT_EQUAL}, + {false, GGML_OP_REPEAT}, + {false, GGML_OP_REPEAT_BACK}, + {false, GGML_OP_CONCAT}, + {false, GGML_OP_SILU_BACK}, + {false, GGML_OP_NORM}, + {false, GGML_OP_RMS_NORM}, + {false, GGML_OP_RMS_NORM_BACK}, + {false, GGML_OP_GROUP_NORM}, + {false, GGML_OP_L2_NORM}, + {true, GGML_OP_MUL_MAT, QNN_OP_MAT_MUL, 2}, + {false, GGML_OP_MUL_MAT_ID}, + {false, GGML_OP_OUT_PROD}, + {false, GGML_OP_SCALE}, + {false, GGML_OP_SET}, + {false, GGML_OP_CPY}, + {false, GGML_OP_CONT}, + {false, GGML_OP_RESHAPE}, + {false, GGML_OP_VIEW}, + {false, GGML_OP_PERMUTE}, + {false, GGML_OP_TRANSPOSE}, + {false, GGML_OP_GET_ROWS}, + {false, GGML_OP_GET_ROWS_BACK}, + {false, GGML_OP_DIAG}, + {false, GGML_OP_DIAG_MASK_INF}, + {false, GGML_OP_DIAG_MASK_ZERO}, + {false, GGML_OP_SOFT_MAX}, + {false, GGML_OP_SOFT_MAX_BACK}, + {false, GGML_OP_ROPE}, + {false, GGML_OP_ROPE_BACK}, + {false, GGML_OP_CLAMP}, + {false, GGML_OP_CONV_TRANSPOSE_1D}, + {false, GGML_OP_IM2COL}, + {false, GGML_OP_IM2COL_BACK}, + {false, GGML_OP_CONV_TRANSPOSE_2D}, + {false, GGML_OP_POOL_1D}, + {false, GGML_OP_POOL_2D}, + {false, GGML_OP_POOL_2D_BACK}, + {false, GGML_OP_UPSCALE}, + {false, GGML_OP_PAD}, + {false, GGML_OP_PAD_REFLECT_1D}, + {false, GGML_OP_ARANGE}, + {false, GGML_OP_TIMESTEP_EMBEDDING}, + {false, GGML_OP_ARGSORT}, + {false, GGML_OP_LEAKY_RELU}, + {false, GGML_OP_FLASH_ATTN_EXT}, + {false, GGML_OP_FLASH_ATTN_BACK}, + {false, GGML_OP_SSM_CONV}, + {false, GGML_OP_SSM_SCAN}, + {false, GGML_OP_WIN_PART}, + {false, GGML_OP_WIN_UNPART}, + {false, GGML_OP_GET_REL_POS}, + {false, GGML_OP_ADD_REL_POS}, + {false, GGML_OP_RWKV_WKV6}, + {false, GGML_OP_GATED_LINEAR_ATTN}, + {false, GGML_OP_RWKV_WKV7}, + {false, GGML_OP_UNARY}, + {false, GGML_OP_MAP_UNARY}, + {false, GGML_OP_MAP_BINARY}, + {false, GGML_OP_MAP_CUSTOM1_F32}, + {false, GGML_OP_MAP_CUSTOM2_F32}, + {false, GGML_OP_MAP_CUSTOM3_F32}, + {false, GGML_OP_MAP_CUSTOM1}, + {false, GGML_OP_MAP_CUSTOM2}, + {false, GGML_OP_MAP_CUSTOM3}, + {false, GGML_OP_CROSS_ENTROPY_LOSS}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK}, + {false, GGML_OP_OPT_STEP_ADAMW}, + {false, static_cast(GGML_UNARY_OP_ABS)}, + {false, static_cast(GGML_UNARY_OP_SGN)}, + {false, static_cast(GGML_UNARY_OP_NEG)}, + {false, static_cast(GGML_UNARY_OP_STEP)}, + {false, 
static_cast(GGML_UNARY_OP_TANH)}, + {false, static_cast(GGML_UNARY_OP_ELU)}, + {false, static_cast(GGML_UNARY_OP_RELU)}, + {false, static_cast(GGML_UNARY_OP_SIGMOID)}, + {false, static_cast(GGML_UNARY_OP_GELU)}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK)}, + {false, static_cast(GGML_UNARY_OP_SILU)}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH)}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID)}, + {false, static_cast(GGML_UNARY_OP_EXP)} }; +static_assert(ggmlqnn_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); +static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(std::size(ggmlqnn_k_op_caps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "pls check ggmlqnn_k_op_caps and ensure is corresponding to latest ggml.h"); + // ================================================================================================= // section-3: ggml-qnn internal troubleshooting function/class // ================================================================================================= @@ -608,7 +655,84 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const } } -#if ENABLE_QNNBACKEND_PERF +static void ggmlqnn_print_tensors_info(const char * func_name, const ggml_backend_qnn_context * ctx, + const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) { + //skip sanity check of params because of performance concern + if (0 == g_qnn_params.print_tensors_info) + return; + + if (nullptr != func_name && nullptr != ctx) { + GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + } + GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + if (nullptr != src1) { + GGMLQNN_LOG_DEBUG( + "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + src1->name, + src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], + src1->ne[3], + src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + } + GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + dst->name, + dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], + dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]); + GGMLQNN_LOG_DEBUG("\n"); +} + +static void ggmlqnn_dump_op_info(const struct ggml_tensor * tensor) { + //skip sanity check of params because of performance concern + if (0 == g_qnn_params.dump_op_info) + return; + + const struct ggml_tensor * src0 = tensor->src[0]; + struct ggml_tensor * src1 = tensor->src[1]; + struct ggml_tensor * dst = const_cast(tensor); + GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); + ggmlqnn_print_tensors_info(nullptr, nullptr, src0, src1, dst); +} + +static void ggmlqnn_dump_tensor_elements(const ggml_tensor * tensor) { + float value = 0; + std::ostringstream tmposs; + if (tensor->type == GGML_TYPE_F32) { + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int 
k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[h * tensor->ne[2] + i * tensor->ne[1] + + j * tensor->ne[0] + k]; + tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value + << " "; + } + if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { + GGMLQNN_LOG_DEBUG("%s\n", tmposs.str().c_str()); + } + tmposs.clear(); + tmposs.str(""); + } + } + } + } + + GGMLQNN_LOG_DEBUG("\n"); +} + +static void ggmlqnn_dump_tensor(const ggml_tensor * tensor, const char * name) { + GGMLQNN_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); + GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", + name, + tensor->type, ggml_type_name(tensor->type), + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); + ggmlqnn_dump_tensor_elements(tensor); + + GGMLQNN_LOG_DEBUG("\n"); +} + class qnn_perf { public: qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; @@ -617,10 +741,14 @@ class qnn_perf { qnn_perf & operator= (const qnn_perf & ) = delete; void start() { + if (0 == g_qnn_params.enable_perf) + return; _begin_time = ggml_time_us(); } void info() { + if (0 == g_qnn_params.enable_perf) + return; _end_time = ggml_time_us(); _duration = (_end_time - _begin_time); GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); @@ -632,160 +760,6 @@ class qnn_perf { int64_t _duration = 0LL; std::string _perf_name; }; -#else -class qnn_perf { -public: - qnn_perf(const std::string & perf_name) { - GGML_UNUSED(perf_name); - } - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; - - void start() {} - void info() {} -}; -#endif - -class qnn_cfg { -public: - void dump(std::function worker) { - if (!_load_success) { - GGMLQNN_LOG_INFO("qnn cfg file %s not loadded", _cfg_filename.c_str()); - return; - } - auto iter = _qnn_cfg.begin(); - while (iter != _qnn_cfg.end()) { - auto kv_iter = iter->second.begin(); - while (kv_iter != iter->second.end()) { - worker(iter->first, kv_iter->first, kv_iter->second); - ++kv_iter; - } - ++iter; - } - } - - bool load(const std::string & file_name) { - if (file_name == "") { - return false; - } - _cfg_filename = file_name; - std::ifstream in; - std::string line; - in.open(file_name.c_str()); - if (not in.is_open()) { - GGMLQNN_LOG_WARN("can't open file %s", file_name.c_str()); - return false; - } - while (getline(in, line)) { - std::string section, key, value; - if (not parse_line(line, section, key, value)) { - continue; - } - set_section_keyvalue(section, key, value); - } - _load_success = true; - return true; - } - - void get_stringvalue(const std::string & section, const std::string & key, std::string & value, std::string default_value) { - value = default_value; - if (_qnn_cfg.find(section) == _qnn_cfg.end()) { - return; - } - if (_qnn_cfg[section].find(key) == _qnn_cfg[section].end()) { - return; - } - value = _qnn_cfg[section][key]; - } - - void get_intvalue(const std::string & section, const std::string & key, int & value, int default_value) { - value = default_value; - if (_qnn_cfg.find(section) == _qnn_cfg.end()) { - return; - } - if (_qnn_cfg[section].find(key) == _qnn_cfg[section].end()) { - return; - } - value = atol(_qnn_cfg[section][key].c_str()); - } - -private: - void ltrim(std::string & str) { - if (str.empty()) return; - size_t len = 0; - char* temp = 
(char*)str.c_str(); - while (*temp && isblank(*temp)) { - ++len; - ++temp; - } - if (len > 0) str.erase(0, len); - } - - void rtrim(std::string & str) { - if (str.empty()) return; - size_t len = str.length(); - size_t pos = len; - while (pos > 0) { - if (not isblank(str[pos - 1])) { - break; - } - --pos; - } - if (pos != len) str.erase(pos); - } - - void trim(std::string& str) { - ltrim(str); - rtrim(str); - } - - void set_section_keyvalue(std::string & section, std::string & key, std::string & value) { - if (_qnn_cfg.find(section) == _qnn_cfg.end()) { - std::unordered_map kv_map; - _qnn_cfg[section] = kv_map; - } - if (key != "" && value != "") _qnn_cfg[section][key] = value; - } - - bool parse_line(std::string & line, std::string & section, std::string & key, std::string & value) { - static std::string cur_section = ""; - std::string nodes[2] = {"#", ";"}; - for (int i = 0; i < 2; ++i) { - std::string::size_type pos = line.find(nodes[i]); - if (pos != std::string::npos) line.erase(pos); - } - trim(line); - if (line == "") return false; - if (line[0] == '[' && line[line.size() - 1] == ']') { - section = line.substr(1, line.size() - 2); - trim(section); - cur_section = section; - return false; - } - if (cur_section == "") return false; - bool is_key = true; - for (size_t i = 0; i < line.size(); ++i) { - if (line[i] == '=') { - is_key = false; - continue; - } - if (is_key) { - key += line[i]; - } else { - value += line[i]; - } - } - section = cur_section; - trim(key); - trim(value); - return true; - } -private: - std::unordered_map> _qnn_cfg; - bool _load_success = false; - std::string _cfg_filename; -}; // ================================================================================================= // section-4: helper function for WoA(Window on ARM) @@ -845,20 +819,36 @@ static const char * dlerror(void) { // ================================================================================================= // section-5: general helper function // ================================================================================================= -//the following 3 helper funcs are used to ensure every QNN tensor name is unique -static void ggmqnn_reset_tensoridx() { - g_ggmltensor_idx = 0; +//TODO: merge the following 6 helper functions which used to ensure every QNN tensor/opcfg name is unique +static void ggmlqnn_reset_tensoridx() { + g_qnntensor_idx = 0; } -static void ggmqnn_inc_tensoridx() { - g_ggmltensor_idx++; +static void ggmlqnn_inc_tensoridx() { + g_qnntensor_idx++; } -static int32_t ggmqnn_get_tensoridx() { +static int32_t ggmlqnn_get_tensoridx() { #if !defined(__ANDROID__) && !defined(__linux__) - return g_ggmltensor_idx.load(); + return g_qnntensor_idx.load(); #else - return g_ggmltensor_idx; + return g_qnntensor_idx; +#endif +} + +static void ggmlqnn_reset_opcfgidx() { + g_qnnopcfg_idx = 0; +} + +static void ggmlqnn_inc_opcfgidx() { + g_qnnopcfg_idx++; +} + +static int32_t ggmlqnn_get_opcfgidx() { +#if !defined(__ANDROID__) && !defined(__linux__) + return g_qnnopcfg_idx.load(); +#else + return g_qnnopcfg_idx; #endif } @@ -989,23 +979,18 @@ static void * ggmlqnn_host_malloc(size_t buffer_size, size_t page_size) { return data; } -static void ggmlqnn_load_cfg() { - std::string cfg_filename = std::string(g_qnn_runtimelib_path) + std::string(g_qnn_cfgfilename); - GGMLQNN_LOG_INFO("load ggml-qnn config from %s", cfg_filename.c_str()); - qnn_cfg qnncfg_instance; - qnncfg_instance.load(cfg_filename); - qnncfg_instance.dump([](const std::string & section, const std::string & key, 
const std::string value) { - std::ostringstream tmposs; - tmposs << "section[" << section << "],[" << key << "] = [" << value << "]" << std::endl; - GGMLQNN_LOG_INFO("%s", tmposs.str().c_str()); - }); - std::string npu_inference_datatype; - qnncfg_instance.get_intvalue("general", "print_qnn_internal_log", g_print_qnn_internal_log, 0); - qnncfg_instance.get_intvalue("general", "inference_approach", g_inference_approach, 0); - qnncfg_instance.get_stringvalue("npu", "npu_inference_datatype", npu_inference_datatype, "fp32"); - GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_print_qnn_internal_log); - GGMLQNN_LOG_INFO("inference_approach=%d", g_inference_approach); - GGMLQNN_LOG_INFO("npu inference data type=%s", npu_inference_datatype.c_str()); +static void ggmlqnn_get_timestring(char * p_currenttime) { + time_t n_seconds = 0; + struct tm * p_tm = nullptr; + + if (nullptr == p_currenttime) + return; + + time(&n_seconds); + p_tm = localtime(&n_seconds); + snprintf(p_currenttime, GGML_QNN_TMPBUF_LEN, "%04d-%02d-%02d-%02d-%02d-%02d", + p_tm->tm_year + 1900, p_tm->tm_mon + 1, p_tm->tm_mday, + p_tm->tm_hour, p_tm->tm_min, p_tm->tm_sec); } // ================================================================================================= @@ -1015,7 +1000,6 @@ static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.id; } - return 0u; } @@ -1336,7 +1320,19 @@ static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * p Qnn_Param_t * params, uint32_t num_params, Qnn_Tensor_t * inputs, uint32_t num_inputs, Qnn_Tensor_t * outputs, uint32_t num_outputs) { - Qnn_OpConfigV1_t v1 = {name, package, type, + + char opcfg_name[GGML_MAX_NAME] = {}; + + //ensure the opcfg name is unique + if (nullptr == name) { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_opcfgidx()); + } else { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_opcfgidx()); + } + GGMLQNN_LOG_DEBUG("create qnn opconfig %s", opcfg_name); + ggmlqnn_inc_opcfgidx(); + + Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, num_params, params, num_inputs, inputs, num_outputs, outputs @@ -1531,7 +1527,7 @@ static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * o } static void ggmlqnn_append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { - char buffer[256] = {}; + char buffer[GGML_QNN_TMPBUF_LEN] = {}; const char * type_name = ggmlqnn_get_ggml_type_name(tensor->type); int len = 0; switch (ggml_n_dims(tensor)) { @@ -1583,64 +1579,207 @@ static void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & o if (!input) { break; } - output += '_'; - ggmlqnn_append_tensor_dimensions(input, output); + output += '_'; + ggmlqnn_append_tensor_dimensions(input, output); + } +} + +static void ggmlqnn_get_opkey_with_srcop_desc(const ggml_tensor * op, std::string & output) { + output += ggml_op_desc(op); + output += '('; + if (op->src[0]) { + output += ggml_op_desc(op->src[0]); + } + for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) { + output += ','; + output += ggml_op_desc(op->src[i]); + } + output += ')'; +} + +static void ggmlqnn_get_graphkey_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { + if (nullptr == cgraph || 0 == cgraph->n_nodes) { + GGMLQNN_LOG_WARN("empty ggml computational graph"); + return; + } + + //output += "cgraph_" + std::to_string(ggml_time_us()); + //return; + + bool is_start = true; + for (int i = 0; i < cgraph->n_nodes; ++i) { + 
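// key layout (illustrative, dims made up): the first real node contributes its
// full key via ggmlqnn_get_graphkey_from_op(); every later node appends '#'
// plus its own op desc and the op descs of its sources, and after the loop the
// last node's type name and dimensions are appended, e.g.
//   "MUL_MAT(...)#ADD(MUL_MAT,NONE)#...f32_128x64x1x1"
// the resulting string keys the per-cgraph map so a prebuilt QNN graph can be
// reused when the same cgraph shape is submitted again.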
auto * op = cgraph->nodes[i]; + if (ggml_is_empty(op)) { + GGMLQNN_LOG_WARN("empty op in graph, skipping"); + continue; + } + + if (op->op == GGML_OP_NONE) { + GGMLQNN_LOG_WARN("GGML_OP_NONE in graph, skipping"); + continue; + } + + if (is_start) { + ggmlqnn_get_graphkey_from_op(cgraph->nodes[0], output); + is_start = false; + } else { + output += '#'; + ggmlqnn_get_opkey_with_srcop_desc(op, output); + } + } + + if (cgraph->n_nodes > 1) { + auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; + output += ggmlqnn_get_ggml_type_name(last_op->type); + output += '_'; + ggmlqnn_append_tensor_dimensions(last_op, output); + } +} + +template +Fn ggmlqnn_load_qnn_functionpointers(void * handle, const char * function_name) { + return reinterpret_cast(dlsym(handle, function_name)); +} + +class qnn_cfg { +public: + void dump(std::function worker) { + if (!_load_success) { + GGMLQNN_LOG_INFO("qnn cfg file %s not loaded", _cfg_filename.c_str()); + return; + } + auto iter = _qnn_cfg.begin(); + while (iter != _qnn_cfg.end()) { + auto kv_iter = iter->second.begin(); + while (kv_iter != iter->second.end()) { + worker(iter->first, kv_iter->first, kv_iter->second); + ++kv_iter; + } + ++iter; + } + } + + bool load(const std::string & file_name) { + if (file_name == "") { + return false; + } + _cfg_filename = file_name; + std::ifstream in; + std::string line; + in.open(file_name.c_str()); + if (not in.is_open()) { + GGMLQNN_LOG_WARN("can't open file %s", file_name.c_str()); + return false; + } + while (getline(in, line)) { + std::string section, key, value; + if (not parse_line(line, section, key, value)) { + continue; + } + set_section_keyvalue(section, key, value); + } + _load_success = true; + return true; + } + + void get_stringvalue(const std::string & section, const std::string & key, std::string & value, std::string default_value) { + value = default_value; + if (_qnn_cfg.find(section) == _qnn_cfg.end()) { + return; + } + if (_qnn_cfg[section].find(key) == _qnn_cfg[section].end()) { + return; + } + value = _qnn_cfg[section][key]; } -} -static void ggmlqnn_get_opkey_with_srcop_desc(const ggml_tensor * op, std::string & output) { - output += ggml_op_desc(op); - output += '('; - if (op->src[0]) { - output += ggml_op_desc(op->src[0]); - } - for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) { - output += ','; - output += ggml_op_desc(op->src[i]); + void get_intvalue(const std::string & section, const std::string & key, int & value, int default_value) { + value = default_value; + if (_qnn_cfg.find(section) == _qnn_cfg.end()) { + return; + } + if (_qnn_cfg[section].find(key) == _qnn_cfg[section].end()) { + return; + } + value = atol(_qnn_cfg[section][key].c_str()); } - output += ')'; -} -static void ggmlqnn_get_graphkey_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { - if (nullptr == cgraph || 0 == cgraph->n_nodes) { - GGMLQNN_LOG_WARN("empty ggml computational graph"); - return; +private: + void ltrim(std::string & str) { + if (str.empty()) return; + size_t len = 0; + char* temp = (char*)str.c_str(); + while (*temp && isblank(*temp)) { + ++len; + ++temp; + } + if (len > 0) str.erase(0, len); } - bool is_start = true; - for (int i = 0; i < cgraph->n_nodes; ++i) { - auto * op = cgraph->nodes[i]; - if (ggml_is_empty(op)) { - GGMLQNN_LOG_WARN("empty op in graph, skipping"); - continue; + void rtrim(std::string & str) { + if (str.empty()) return; + size_t len = str.length(); + size_t pos = len; + while (pos > 0) { + if (not isblank(str[pos - 1])) { + break; + } + --pos; } + if (pos != len) 
str.erase(pos); + } - if (op->op == GGML_OP_NONE) { - GGMLQNN_LOG_WARN("GGML_OP_NONE in graph, skipping"); - continue; - } + void trim(std::string& str) { + ltrim(str); + rtrim(str); + } - if (is_start) { - ggmlqnn_get_graphkey_from_op(cgraph->nodes[0], output); - is_start = false; - } else { - output += '#'; - ggmlqnn_get_opkey_with_srcop_desc(op, output); + void set_section_keyvalue(std::string & section, std::string & key, std::string & value) { + if (_qnn_cfg.find(section) == _qnn_cfg.end()) { + std::unordered_map kv_map; + _qnn_cfg[section] = kv_map; } + if (key != "" && value != "") _qnn_cfg[section][key] = value; } - if (cgraph->n_nodes > 1) { - auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; - output += ggmlqnn_get_ggml_type_name(last_op->type); - output += '_'; - ggmlqnn_append_tensor_dimensions(last_op, output); + bool parse_line(std::string & line, std::string & section, std::string & key, std::string & value) { + static std::string cur_section = ""; + std::string nodes[2] = {"#", ";"}; + for (int i = 0; i < 2; ++i) { + std::string::size_type pos = line.find(nodes[i]); + if (pos != std::string::npos) line.erase(pos); + } + trim(line); + if (line == "") return false; + if (line[0] == '[' && line[line.size() - 1] == ']') { + section = line.substr(1, line.size() - 2); + trim(section); + cur_section = section; + return false; + } + if (cur_section == "") return false; + bool is_key = true; + for (size_t i = 0; i < line.size(); ++i) { + if (line[i] == '=') { + is_key = false; + continue; + } + if (is_key) { + key += line[i]; + } else { + value += line[i]; + } + } + section = cur_section; + trim(key); + trim(value); + return true; } -} - -template -Fn ggmlqnn_load_qnn_functionpointers(void * handle, const char * function_name) { - return reinterpret_cast(dlsym(handle, function_name)); -} +private: + std::unordered_map> _qnn_cfg; + bool _load_success = false; + std::string _cfg_filename; +}; class qnn_interface { #define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ @@ -1830,11 +1969,11 @@ class qnn_instance { bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } - int init_htp_perfinfra(); + int htp_init_perfinfra(); - int set_rpc_polling(); + int htp_set_rpc_polling(); - int set_high_performance_mode(); + int htp_set_high_performance_mode(); std::string & get_qnn_graph_name() { return _graph_name; } @@ -1877,9 +2016,6 @@ class qnn_instance { return _device_id; } -public: - std::map>> _qnn_graph_map; - private: int load_system(); @@ -1901,7 +2037,7 @@ class qnn_instance { void htp_print_info(); - void htp_probe_device_meminfo(); + void htp_probe_rpc_meminfo(); void print_backend_info(); @@ -1924,7 +2060,7 @@ class qnn_instance { bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - ggml_qnn_profile_level _profile_level = ggml_qnn_profile_level::profile_detail; + qnn_profile_level _profile_level = qnn_profile_level::profile_off; void * _system_lib_handle = nullptr; void * _loaded_lib_handle = nullptr; @@ -2314,7 +2450,7 @@ int qnn_instance::load_system() { if (nullptr == _system_lib_handle) { GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); //re-try with default path of QNN binary runtime lib - _lib_path = std::string(g_qnn_runtimelib_path); + _lib_path = std::string(g_qnn_params.qnn_runtimelib_path); #if !defined(__ANDROID__) && !defined(__linux__) system_lib_path = _lib_path + "QnnSystem.dll"; #else @@ -2411,16 
+2547,16 @@ int qnn_instance::unload_system() { return result; } -static void ggml_qnn_logcallback(const char * fmt, +static void ggmlqnn_compute_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { - if (0 == g_print_qnn_internal_log) + if (0 == g_qnn_params.print_qnn_internal_log) return; static std::mutex log_mutex; - static unsigned char s_ggml_qnn_logbuf[GGML_QNN_LOGBUF_LEN]; + static unsigned char s_ggmlqnn_compute_logbuf[GGML_QNN_LOGBUF_LEN]; const char * log_level_desc = ""; switch (level) { @@ -2447,9 +2583,9 @@ static void ggml_qnn_logcallback(const char * fmt, double ms = (double) timestamp / 1000000.0; { std::lock_guard lock(log_mutex); - memset(s_ggml_qnn_logbuf, 0, GGML_QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggml_qnn_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - GGMLQNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggml_qnn_logbuf); + memset(s_ggmlqnn_compute_logbuf, 0, GGML_QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggmlqnn_compute_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); + GGMLQNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_compute_logbuf); } } @@ -2489,9 +2625,9 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_interface.set_qnn_interface(_loaded_backend); #if 1 - _qnn_interface.qnn_log_create(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); + _qnn_interface.qnn_log_create(ggmlqnn_compute_logcallback, _qnn_log_level, &_qnn_log_handle); #else - _qnn_raw_interface.logCreate(ggml_qnn_logcallback, _qnn_log_level, &_qnn_log_handle); + _qnn_raw_interface.logCreate(ggmlqnn_compute_logcallback, _qnn_log_level, &_qnn_log_handle); #endif if (nullptr == _qnn_log_handle) { GGMLQNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone @@ -2521,17 +2657,62 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } - auto qnnstatus = _qnn_raw_interface.deviceCreate( - _qnn_log_handle, nullptr, &_qnn_device_handle); + auto qnnstatus = QNN_SUCCESS; + if (_device_id == QNN_BACKEND_NPU) { + //TODO: remove duplicated code between here and function htp_print_info + const QnnDevice_PlatformInfo_t * p_info = nullptr; + qcom_socinfo soc_info = {}; + qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + if (qnnstatus == QNN_SUCCESS) { + GGMLQNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLQNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + GGMLQNN_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + } else { + GGMLQNN_LOG_WARN("failed to get platform info, are we in emulator?\n"); + soc_info = { NONE, UNKNOWN_SM, 0 }; + } + + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = soc_info.soc_model; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; + + /* + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)soc_info.htp_arch; + arch_customconfig.arch.deviceId = 0; + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + */ + const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, nullptr }; + qnnstatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnnstatus = _qnn_interface.qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); + } if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { GGMLQNN_LOG_WARN("failed to create QNN device\n"); } else { GGMLQNN_LOG_INFO("create device successfully\n"); } - if (ggml_qnn_profile_level::profile_off != _profile_level) { + if (qnn_profile_level::profile_off != _profile_level) { GGMLQNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (ggml_qnn_profile_level::profile_basic == _profile_level) { + if (qnn_profile_level::profile_basic == _profile_level) { GGMLQNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { @@ -2540,7 +2721,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } else { GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); } - } else if (ggml_qnn_profile_level::profile_detail == _profile_level) { + } else if (qnn_profile_level::profile_detail == _profile_level) { GGMLQNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { @@ -2553,7 +2734,14 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } #if defined(__ANDROID__) || defined(__linux__) - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + //_rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + std::filesystem::path full_path(std::string(g_qnn_params.qnn_runtimelib_path) + "libcdsprpc.so"); + full_path /= std::filesystem::path("libcdsprpc.so").filename(); + _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + GGMLQNN_LOG_WARN("failed to load %s\n", full_path.c_str()); + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + } #else _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); #endif @@ -2593,16 +2781,16 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (_backend_name.find("Htp") != std::string::npos) { htp_print_info(); - htp_probe_device_meminfo(); + htp_probe_rpc_meminfo(); - if (0 != init_htp_perfinfra()) { + if (0 != htp_init_perfinfra()) { GGMLQNN_LOG_WARN("initialize HTP performance failure"); } -#if 0 - if (0 != set_rpc_polling()) { +#if 1 + if (0 != htp_set_rpc_polling()) { GGMLQNN_LOG_WARN("set RPC polling failure"); } - if (0 != set_high_performance_mode()) { + if (0 != htp_set_high_performance_mode()) { GGMLQNN_LOG_WARN("set HTP high performance mode failure"); } #else @@ -2628,7 +2816,8 @@ int qnn_instance::qnn_finalize() { Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("enter %s\n", __func__); - ggmqnn_reset_tensoridx(); + ggmlqnn_reset_tensoridx(); + ggmlqnn_reset_opcfgidx(); free_rpcmem(); unregister_rpcmem(); @@ -2706,7 +2895,6 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi GGMLQNN_LOG_DEBUG("[%s][%s]created", ggml_backend_qnn_get_devname(device), graph_name.c_str()); Qnn_ErrorHandle_t error = QNN_SUCCESS; - Qnn_GraphHandle_t graph_handle = nullptr; if (device == QNN_BACKEND_NPU) { QnnHtpGraph_CustomConfig_t hvx_config; hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; @@ -2715,42 +2903,52 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_hvx_config.customConfig = &hvx_config; - QnnHtpGraph_CustomConfig_t dlbc_config; + QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - dlbc_config.optimizationOption.floatValue = 1.0; // set to 0.0 to turn off DLBC + if (0 == g_qnn_params.enable_dlbc) + dlbc_config.optimizationOption.floatValue = 0.0; // set to 0.0 to turn off DLBC + else + dlbc_config.optimizationOption.floatValue = 1.0; // set to 1.0 to turn on DLBC QnnGraph_Config_t graph_dlbc_config; graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_dlbc_config.customConfig = &dlbc_config; - QnnHtpGraph_CustomConfig_t opt_config; + QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; opt_config.optimizationOption.floatValue = 1; // 1 / 3 QnnGraph_Config_t graph_opt_config; 
graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_opt_config.customConfig = &opt_config; - QnnHtpGraph_CustomConfig_t vtcm_config; + QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; QnnGraph_Config_t graph_vtcm_config; graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; graph_vtcm_config.customConfig = &vtcm_config; - QnnHtpGraph_CustomConfig_t fp16_config; - fp16_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; - fp16_config.precision = QNN_PRECISION_FLOAT16; - QnnGraph_Config_t graph_fp16_config; - graph_fp16_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_fp16_config.customConfig = &fp16_config; - - const QnnGraph_Config_t * graph_configs[] = {&graph_hvx_config, &graph_dlbc_config, &graph_vtcm_config, - &graph_opt_config, &graph_fp16_config, nullptr}; - error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs, &graph_handle); + std::vector graph_configs; + graph_configs.push_back(&graph_hvx_config); + graph_configs.push_back(&graph_dlbc_config); + graph_configs.push_back(&graph_vtcm_config); + graph_configs.push_back(&graph_opt_config); + if (1 == g_qnn_params.precision_mode) { + QnnHtpGraph_CustomConfig_t fp16_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + fp16_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + fp16_config.precision = QNN_PRECISION_FLOAT16; + QnnGraph_Config_t graph_fp16_config; + graph_fp16_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_fp16_config.customConfig = &fp16_config; + graph_configs.push_back(&graph_fp16_config); + } + graph_configs.push_back(nullptr); + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs.data(), &_qnn_graph_handle); + GGMLQNN_LOG_DEBUG("[%s][%s]created graph %p", ggml_backend_qnn_get_devname(device), graph_name.c_str(), _qnn_graph_handle); } else { - error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &graph_handle); + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &_qnn_graph_handle); } - if (error != QNN_SUCCESS) { GGMLQNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", ggml_backend_qnn_get_devname(device), graph_name.c_str(), @@ -2759,7 +2957,6 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi } GGMLQNN_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); - _qnn_graph_handle = graph_handle; if (device == QNN_BACKEND_NPU) { htp_set_n_hvx_threads(hvx_threads); } @@ -2817,7 +3014,7 @@ int qnn_instance::finalize_qnn_graph() { return 0; } -int qnn_instance::init_htp_perfinfra() { +int qnn_instance::htp_init_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); if (error != QNN_SUCCESS) { @@ -2840,57 +3037,6 @@ int qnn_instance::init_htp_perfinfra() { return 0; } -int qnn_instance::set_rpc_polling() { - if (_qnn_rpc_pollingtime > 0) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; - memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); - rpc_pollingtime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; - if (_qnn_htp_perfinfra) { - 
_qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); - } - } - return 0; -} - -int qnn_instance::set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { - GGMLQNN_LOG_DEBUG("perf intra is null\n"); - return 1; - } - - QnnHtpPerfInfrastructure_PowerConfig_t power_config; - memset(&power_config, 0, sizeof(power_config)); - power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.contextId = _qnn_htp_powerconfig_id; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False - power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False - power_config.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False - power_config.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable - power_config.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False - // set Sleep latency parameter - uint32_t latencyValue = 40; - power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec - // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr}; - - _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); - - return 0; -} - void qnn_instance::htp_print_info() { const QnnDevice_PlatformInfo_t * p_info = nullptr; _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); @@ -2922,7 +3068,7 @@ void qnn_instance::htp_print_info() { _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); } -void qnn_instance::htp_probe_device_meminfo() { +void qnn_instance::htp_probe_rpc_meminfo() { size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; const int SIZE_IN_MB = (1 << 20); @@ -2976,24 +3122,96 @@ void qnn_instance::print_backend_info() { print_property("Qnn group device", QNN_PROPERTY_GROUP_DEVICE); } -void qnn_instance::htp_set_memory_grow_size(size_t size) { - QnnHtpPerfInfrastructure_MemoryConfig_t grow_size_config = { - .option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE, - .memGrowSizeConfig = (uint32_t)size, - }; +void qnn_instance::htp_set_memory_grow_size(size_t size) { + QnnHtpPerfInfrastructure_MemoryConfig_t grow_size_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE, + .memGrowSizeConfig = (uint32_t)size, + }; + + const QnnHtpPerfInfrastructure_MemoryConfig_t *memory_config[] = { + &grow_size_config, + nullptr, + }; + Qnn_ErrorHandle_t ret = 
_qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); + if (ret != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to set HTP memory config"); + } else { + GGMLQNN_LOG_INFO("succeed to set HTP memory config"); + } +} + +void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { + QnnHtpGraph_CustomConfig_t htp_hvx_thread_config = { + .option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS, + .numHvxThreads = n_threads, + }; + + QnnGraph_Config_t hvx_thread_config = { + .option = QNN_GRAPH_CONFIG_OPTION_CUSTOM, + .customConfig = &htp_hvx_thread_config, + }; + + const QnnGraph_Config_t * graph_configs[] = {&hvx_thread_config, nullptr}; + Qnn_ErrorHandle_t ret = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); + if (ret != QNN_SUCCESS) { + GGMLQNN_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); + } else { + GGMLQNN_LOG_INFO("succeed to set QNN graph config: set hvx threads %d", n_threads); + } +} + +int qnn_instance::htp_set_rpc_polling() { + if (_qnn_rpc_pollingtime > 0) { + QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; + memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); + rpc_pollingtime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; + rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; + if (_qnn_htp_perfinfra) { + _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); + } + } + return 0; +} + +int qnn_instance::htp_set_high_performance_mode() { + if (nullptr == _qnn_htp_perfinfra) { + GGMLQNN_LOG_DEBUG("perf intra is null\n"); + return 1; + } + + QnnHtpPerfInfrastructure_PowerConfig_t power_config; + memset(&power_config, 0, sizeof(power_config)); + power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; + power_config.dcvsV3Config.dcvsEnable = 0; + power_config.dcvsV3Config.setDcvsEnable = 1; + power_config.dcvsV3Config.contextId = _qnn_htp_powerconfig_id; + power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; + power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False + power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False + power_config.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False + power_config.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable + power_config.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False + // set Sleep latency parameter + uint32_t latencyValue = 40; + power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec + // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) + power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; + // set power 
config with different performance parameters + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr}; + + _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); - const QnnHtpPerfInfrastructure_MemoryConfig_t *memory_config[] = { - &grow_size_config, - nullptr, - }; - Qnn_ErrorHandle_t ret = _qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); - if (ret != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to set HTP memory config"); - } else { - GGMLQNN_LOG_INFO("succeed to set HTP memory config"); - } + return 0; } +//TODO: merge code between this function and htp_set_rpc_polling,htp_set_high_performance_mode void qnn_instance::htp_enter_performance_mode() { QnnHtpPerfInfrastructure_PowerConfig_t dcvs_v3_config = { .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, @@ -3046,7 +3264,7 @@ void qnn_instance::htp_enter_performance_mode() { .rpcPollingTimeConfig = 9999, }; - const QnnHtpPerfInfrastructure_PowerConfig_t *power_configs[] = { + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { &dcvs_v3_config, &hmx_config, &rpc_ctrl_config, @@ -3061,26 +3279,6 @@ void qnn_instance::htp_enter_performance_mode() { } } -void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { - QnnHtpGraph_CustomConfig_t htp_hvx_thread_config = { - .option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS, - .numHvxThreads = n_threads, - }; - - QnnGraph_Config_t hvx_thread_config = { - .option = QNN_GRAPH_CONFIG_OPTION_CUSTOM, - .customConfig = &htp_hvx_thread_config, - }; - - const QnnGraph_Config_t * graph_configs[] = {&hvx_thread_config, nullptr}; - Qnn_ErrorHandle_t ret = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); - if (ret != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); - } else { - GGMLQNN_LOG_INFO("succeed to set QNN graph config: set hvx threads %d", n_threads); - } -} - static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { GGMLQNN_LOG_WARN("invalid params\n"); @@ -3100,36 +3298,45 @@ static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_t return qnn_rpcbuffer; } -static void ggmlqnn_print_tensors_info(const char * func_name, ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - //skip sanity check of params because of performance concern - if (nullptr != func_name && nullptr != ctx) { - GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); +static void ggmlqnn_load_cfg() { + char time_string[GGML_QNN_TMPBUF_LEN]; + memset(time_string, 0, GGML_QNN_TMPBUF_LEN); + ggmlqnn_get_timestring(time_string); + GGMLQNN_LOG_DEBUG("program running start time:%s", time_string); + std::string cfg_filename = std::string(g_qnn_params.qnn_runtimelib_path) + std::string(g_qnn_params.qnn_cfgfilename); + GGMLQNN_LOG_INFO("load ggml-qnn config from %s", cfg_filename.c_str()); + qnn_cfg qnncfg_instance; + qnncfg_instance.load(cfg_filename); + qnncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]" << std::endl; + GGMLQNN_LOG_INFO("%s", tmposs.str().c_str()); + }); + 
std::string precision_mode; + qnncfg_instance.get_intvalue("general", "print_qnn_internal_log", g_qnn_params.print_qnn_internal_log, 0); + qnncfg_instance.get_intvalue("general", "enable_perf", g_qnn_params.enable_perf, 0); + qnncfg_instance.get_intvalue("general", "print_tensors_info", g_qnn_params.print_tensors_info, 0); + qnncfg_instance.get_intvalue("general", "dump_op_info", g_qnn_params.dump_op_info, 0); + qnncfg_instance.get_intvalue("general", "inference_approach", g_qnn_params.inference_approach, 0); + qnncfg_instance.get_intvalue("general", "qnn_backend", g_qnn_params.qnn_backend, 2); + qnncfg_instance.get_intvalue("npu", "hvx_threads", g_qnn_params.hvx_threads, 4); + qnncfg_instance.get_intvalue("npu", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8); + qnncfg_instance.get_intvalue("npu", "enable_dlbc", g_qnn_params.enable_dlbc, 0); + qnncfg_instance.get_stringvalue("npu", "precision_mode", precision_mode, "fp32"); + GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_qnn_params.print_qnn_internal_log); + GGMLQNN_LOG_INFO("inference_approach=%d", g_qnn_params.inference_approach); + GGMLQNN_LOG_INFO("qnn_backend=%d", g_qnn_params.qnn_backend); + GGMLQNN_LOG_INFO("npu inference precision mode=%s", precision_mode.c_str()); + GGMLQNN_LOG_INFO("qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); + if (precision_mode.find("fp16") != std::string::npos) { + g_qnn_params.precision_mode = 1; + } else { + g_qnn_params.precision_mode = 0; } - GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); - GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", - src1->name, - src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], - src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); - GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", - dst->name, - dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], - dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]); - GGMLQNN_LOG_DEBUG("\n"); -} - -static void ggmlqnn_dump_op_info(const struct ggml_tensor * tensor) { - //skip sanity check of params because of performance concern - const struct ggml_tensor * src0 = tensor->src[0]; - struct ggml_tensor * src1 = tensor->src[1]; - struct ggml_tensor * dst = const_cast(tensor); - GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); - ggmlqnn_print_tensors_info(nullptr, nullptr, src0, src1, dst); } -static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, const char * name, +static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, @@ -3140,12 +3347,12 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, //ensure the tensor name is unique if (nullptr == name) { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmqnn_get_tensoridx()); + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmlqnn_get_tensoridx()); } else { - 
snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmqnn_get_tensoridx()); + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmlqnn_get_tensoridx()); } - GGMLQNN_LOG_DEBUG("init_tensor %d", ggmqnn_get_tensoridx()); - ggmqnn_inc_tensoridx(); + GGMLQNN_LOG_DEBUG("init_tensor %s", tensor_name); + ggmlqnn_inc_tensoridx(); uint32_t reverse_dims[GGML_MAX_DIMS] = {}; uint32_t transpose_dims[GGML_MAX_DIMS] = {}; @@ -3196,9 +3403,7 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, } } }; - if (nullptr != name) { - QNN_VER_PTR(qnn_tensor)->name = name; - } + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { GGMLQNN_LOG_WARN("calloc failed"); @@ -3210,12 +3415,22 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(const ggml_tensor * tensor, GGMLQNN_LOG_WARN("init tensor failed"); return nullptr; } - QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; + + bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == QNN_BACKEND_NPU); + if (enable_npu_rpc) { + QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; + } else { + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; + } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); return p_qnn_tensor; } -static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { +static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { Qnn_ErrorHandle_t error = QNN_SUCCESS; uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; @@ -3233,140 +3448,149 @@ static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn } qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); - Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(tensor, nullptr, - qnn_tensor_type, qnn_data_type, - ggml_n_dims(tensor), dimensions, - nullptr, 0); - - bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == QNN_BACKEND_NPU); - if (enable_npu_rpc) { - QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; - } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); - + Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, tensor, nullptr, + qnn_tensor_type, qnn_data_type, + ggml_n_dims(tensor), dimensions, + nullptr, 0); return p_qnn_tensor; } // ================================================================================================= // section-8: implementation of ggml-qnn backend // ================================================================================================= -//TODO: refine this function as it is a performance hotspot/bottleneck function -static bool ggml_qnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * tensor) { - if (tensor->op == GGML_OP_NONE) { - return true; +static bool 
ggmlqnn_same_types(const ggml_backend_qnn_context * ctx, const ggml_tensor * op_tensor) { + GGML_UNUSED(ctx); + ggml_tensor * src0 = op_tensor->src[0]; + ggml_tensor * src1 = op_tensor->src[1]; + if (nullptr != src1) { + if (src0->type != op_tensor->type || src1->type != op_tensor->type) { + return false; + } + } else { + if (src0->type != op_tensor->type) { + return false; + } } - if (ggml_is_empty(tensor) || tensor->op == GGML_OP_RESHAPE - || tensor->op == GGML_OP_TRANSPOSE - || tensor->op == GGML_OP_VIEW - || tensor->op == GGML_OP_PERMUTE - ) { + if (src0->type != GGML_TYPE_F32) return false; + return true; +} + +static bool ggmlqnn_compute_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { + if (op_tensor->op == GGML_OP_NONE) { + return true; } - //TODO: add other op here - bool supported_op = ((tensor->op == GGML_OP_ADD) - || (tensor->op == GGML_OP_MUL_MAT) - || (tensor->op == GGML_OP_MUL) - ); - if (!supported_op) { + if (!ggmlqnn_k_op_caps[ggmlqnn_get_op_index(op_tensor)].supported) { return false; } - struct ggml_tensor * src0 = tensor->src[0]; - struct ggml_tensor * src1 = tensor->src[1]; + struct ggml_tensor * src0 = op_tensor->src[0]; + struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = tensor->src[0]->ne[0]; - const int64_t ne01 = tensor->src[0]->ne[1]; + const int64_t ne00 = op_tensor->src[0]->ne[0]; + const int64_t ne01 = op_tensor->src[0]->ne[1]; + const int64_t ne0 = op_tensor->ne[0]; + const int64_t ne1 = op_tensor->ne[1]; - const int64_t ne10 = tensor->src[1]->ne[0]; - const int64_t ne11 = tensor->src[1]->ne[1]; - - const int64_t ne0 = tensor->ne[0]; - const int64_t ne1 = tensor->ne[1]; - - const uint32_t src0_rank = ggml_n_dims(src0); - const uint32_t src1_rank = ggml_n_dims(src1); + uint32_t src0_rank = ggml_n_dims(src0); + uint32_t src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); + } GGML_UNUSED(ne01); - GGML_UNUSED(ne10); - GGML_UNUSED(ne11); GGML_UNUSED(ne0); GGML_UNUSED(ne1); + switch (op_tensor->op) { + case GGML_OP_ADD: + case GGML_OP_SUB: + { + //ggmlqnn_dump_op_info(op_tensor); + if (!ggml_are_same_shape(src0, src1)) { + return false; + } - if (tensor->op == GGML_OP_ADD) { - //ggmlqnn_dump_op_info(tensor); - if (!ggml_are_same_shape(src0, src1)) { - return false; + if (ne00 < 32) + return false; + + return ggmlqnn_same_types(ctx, op_tensor); } - if (ne00 < 32) - return false; - return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32); - } - if (tensor->op == GGML_OP_MUL_MAT) { - //ggmlqnn_dump_op_info(tensor); - if (src0_rank != src1_rank) // make QNN SDK happy - return false; - if (src0_rank < 2) // QNN's limitation, make QNN SDK happy - return false; - if (4 == src0_rank) //TODO: 4D matrix mulmat in CT - return false; - if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy - return false; + case GGML_OP_DIV: + case GGML_OP_MUL: { + //ggmlqnn_dump_op_info(op_tensor); + if (ctx->device == QNN_BACKEND_NPU) + return false; - if (ctx->device == QNN_BACKEND_NPU) - if (2 == src0_rank) - return (src0->type == GGML_TYPE_F32 - || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 - || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K - ) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); - else - return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); - else - return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) - && (src1->type == 
GGML_TYPE_F32) && (tensor->type == GGML_TYPE_F32); - } + if (!ggml_are_same_shape(src0, src1)) { + return false; + } - if (tensor->op == GGML_OP_MUL) { - //ggmlqnn_dump_op_info(tensor); - if (ctx->device == QNN_BACKEND_NPU) - return false; - if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix - return false; - return (src0->type == GGML_TYPE_F32) - && (src1->type == GGML_TYPE_F32) - && (tensor->type == src1->type); - } + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix + return false; - return false; + return ggmlqnn_same_types(ctx, op_tensor); + } + case GGML_OP_MUL_MAT: + { + ggmlqnn_dump_op_info(op_tensor); + if (src0_rank != src1_rank) // make QNN SDK happy + return false; + + if (src0_rank < 2) // QNN's limitation, make QNN SDK happy + return false; + + if (4 == src0_rank) //TODO: 4D matrix mulmat in CT + return false; + + if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy + return false; + + if (ctx->device == QNN_BACKEND_NPU) { + return (src0->type == GGML_TYPE_F32 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K + ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) + && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } + } + case GGML_OP_LOG: + { + if (ctx->device == QNN_BACKEND_NPU) + return false; + } + case GGML_OP_SQRT: + default: + return ggmlqnn_same_types(ctx, op_tensor); + } } -static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { - ggmlqnn_op_func_t func = nullptr; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; +static bool ggmlqnn_compute_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { + ggmlqnn_op_func_t func = nullptr; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; switch (dst->op) { case GGML_OP_REPEAT: - ggml_qnn_repeat(ctx, dst); + ggmlqnn_compute_repeat(ctx, dst); break; case GGML_OP_GET_ROWS: - ggml_qnn_get_rows(ctx, dst); + ggmlqnn_compute_get_rows(ctx, dst); break; case GGML_OP_DUP: - ggml_qnn_dup(ctx, dst); + ggmlqnn_compute_dup(ctx, dst); break; case GGML_OP_ADD: - func = ggml_qnn_general_node; - break; - case GGML_OP_ACC: - ggml_qnn_acc(ctx, dst); - break; + case GGML_OP_SUB: case GGML_OP_MUL: - func = ggml_qnn_general_node; - break; case GGML_OP_DIV: - ggml_qnn_div(ctx, dst); + case GGML_OP_SQRT: + case GGML_OP_LOG: + func = ggmlqnn_compute_elementwise; + break; + case GGML_OP_ACC: + ggmlqnn_compute_acc(ctx, dst); break; case GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { @@ -3389,51 +3613,51 @@ static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor } break; case GGML_OP_NORM: - ggml_qnn_norm(ctx, dst); + ggmlqnn_compute_norm(ctx, dst); break; case GGML_OP_GROUP_NORM: - ggml_qnn_group_norm(ctx, dst); + ggmlqnn_compute_group_norm(ctx, dst); break; case GGML_OP_CONCAT: - ggml_qnn_concat(ctx, dst); + ggmlqnn_compute_concat(ctx, dst); break; case GGML_OP_UPSCALE: - ggml_qnn_upsample_nearest2d(ctx, dst); + ggmlqnn_compute_upsample_nearest2d(ctx, dst); break; case GGML_OP_PAD: - ggml_qnn_pad(ctx, dst); + ggmlqnn_compute_pad(ctx, dst); break; case GGML_OP_ARANGE: - ggml_qnn_arange(ctx, dst); + ggmlqnn_compute_arange(ctx, dst); break; case GGML_OP_TIMESTEP_EMBEDDING: - ggml_qnn_timestep_embedding(ctx, dst); + 
ggmlqnn_compute_timestep_embedding(ctx, dst); break; case GGML_OP_LEAKY_RELU: - ggml_qnn_leaky_relu(ctx, dst); + ggmlqnn_compute_leaky_relu(ctx, dst); break; case GGML_OP_RMS_NORM: - ggml_qnn_rms_norm(ctx, dst); + ggmlqnn_compute_rms_norm(ctx, dst); break; case GGML_OP_MUL_MAT: - ggml_qnn_mul_mat(ctx, dst); + ggmlqnn_compute_mul_mat(ctx, dst); break; case GGML_OP_MUL_MAT_ID: return false; case GGML_OP_SCALE: - ggml_qnn_scale(ctx, dst); + ggmlqnn_compute_scale(ctx, dst); break; case GGML_OP_SQR: - ggml_qnn_sqr(ctx, dst); + ggmlqnn_compute_sqr(ctx, dst); break; case GGML_OP_CLAMP: - ggml_qnn_clamp(ctx, dst); + ggmlqnn_compute_clamp(ctx, dst); break; case GGML_OP_CPY: - ggml_qnn_cpy(ctx, dst); + ggmlqnn_compute_cpy(ctx, dst); break; case GGML_OP_CONT: - ggml_qnn_dup(ctx, dst); + ggmlqnn_compute_dup(ctx, dst); break; case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -3442,25 +3666,25 @@ static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor case GGML_OP_TRANSPOSE: break; case GGML_OP_DIAG_MASK_INF: - ggml_qnn_diag_mask(ctx, dst, -INFINITY); + ggmlqnn_compute_diag_mask(ctx, dst, -INFINITY); break; case GGML_OP_SOFT_MAX: - ggml_qnn_softmax(ctx, dst); + ggmlqnn_compute_softmax(ctx, dst); break; case GGML_OP_ROPE: - ggml_qnn_rope(ctx, dst); + ggmlqnn_compute_rope(ctx, dst); break; case GGML_OP_IM2COL: - ggml_qnn_im2col(ctx, dst); + ggmlqnn_compute_im2col(ctx, dst); break; case GGML_OP_POOL_2D: - ggml_qnn_pool2d(ctx, dst); + ggmlqnn_compute_pool2d(ctx, dst); break; case GGML_OP_SUM_ROWS: - ggml_qnn_sum_rows(ctx, dst); + ggmlqnn_compute_sum_rows(ctx, dst); break; case GGML_OP_ARGSORT: - ggml_qnn_argsort(ctx, dst); + ggmlqnn_compute_argsort(ctx, dst); break; default: return false; @@ -3472,6 +3696,7 @@ static bool ggml_qnn_compute_forward(ggml_backend_t backend, struct ggml_tensor return true; } +//TODO: refine this data structure struct ggml_backend_qnn_buffer_context { ~ggml_backend_qnn_buffer_context() { if (buffer) { @@ -3572,7 +3797,7 @@ static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t } static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_buffer_type_t buft, size_t size) { ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; size_t size_page = 0; @@ -3623,24 +3848,37 @@ static const char * ggml_backend_qnn_name(ggml_backend_t backend) { static void ggml_backend_qnn_free(ggml_backend_t backend) { GGMLQNN_LOG_DEBUG("enter %s", __func__ ); ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - GGMLQNN_LOG_DEBUG("idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + GGMLQNN_LOG_DEBUG("device idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { - std::map>>::iterator graph_it; - - for (graph_it = instance->_qnn_graph_map.begin(); - graph_it != instance->_qnn_graph_map.end(); graph_it++) { - auto & graph_item = graph_it->second; - Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensors = std::get<1>(graph_item); - for (auto tensor_it = tensors.begin(); tensor_it != tensors.end(); ++tensor_it) { + std::map::iterator singlenode_graph_it; + for (singlenode_graph_it = ctx->qnn_singlenode_graph_map.begin(); + singlenode_graph_it != ctx->qnn_singlenode_graph_map.end(); singlenode_graph_it++) { + auto & graph_res = singlenode_graph_it->second; + Qnn_GraphHandle_t & graph_handle = 
std::get<0>(graph_res); + qnn_ptensors_t & ptensors = std::get<1>(graph_res); + for (auto tensor_it = ptensors.begin(); tensor_it != ptensors.end(); ++tensor_it) { free_qnn_tensor(*tensor_it); } GGML_UNUSED(graph_handle); - GGMLQNN_LOG_DEBUG("graph type:%s", graph_it->first.c_str()); + GGMLQNN_LOG_DEBUG("clean up graph:%s", singlenode_graph_it->first.c_str()); } - instance->_qnn_graph_map.clear(); + ctx->qnn_singlenode_graph_map.clear(); + + std::map::iterator multinode_graph_it; + for (multinode_graph_it = ctx->qnn_multinode_graph_map.begin(); + multinode_graph_it != ctx->qnn_multinode_graph_map.end(); multinode_graph_it++) { + auto & graph_res = multinode_graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_res); + qnn_ptensors_t & ptensors = std::get<2>(graph_res); + for (auto tensor_it = ptensors.begin(); tensor_it != ptensors.end(); ++tensor_it) { + free_qnn_tensor(*tensor_it); + } + GGML_UNUSED(graph_handle); + GGMLQNN_LOG_DEBUG("clean up graph:%s", multinode_graph_it->first.c_str()); + } + ctx->qnn_multinode_graph_map.clear(); instance->qnn_finalize(); delete instance; @@ -3654,29 +3892,31 @@ static void ggml_backend_qnn_free(ggml_backend_t backend) { GGMLQNN_LOG_DEBUG("leave %s", __func__ ); } -static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +//this is the first tech approach(or general approach in other ggml backends, such as ggml-sycl or ggml-cann) +static enum ggml_status ggmlqnn_backend_graph_compute_general(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; GGML_UNUSED(ctx); - //GGMLQNN_LOG_DEBUG("device %d", ctx->device); - //GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); - - if (0 == g_inference_approach) { - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE - || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW - || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; - } - bool ok = ggml_qnn_compute_forward(backend, node); - if (!ok) { - GGMLQNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); - } +#if 0 + GGMLQNN_LOG_DEBUG("device %d", ctx->device); + GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); + int num_nodes = std::min(5, cgraph->n_nodes); + for (int i = 0; i < num_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } +#endif + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE + || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW + || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + bool ok = ggmlqnn_compute_compute_forward(backend, node); + if (!ok) { + GGMLQNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } - } else { - //offload entire cgraph to QNN CPU & GPU & NPU - return ggmlqnn_graph_compute(backend, cgraph); } return result; @@ -3765,10 +4005,32 @@ static void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, static ggml_backend_t ggml_backend_qnn_device_init_backend(ggml_backend_dev_t dev, const char * 
params) {
     GGML_UNUSED(dev);
+    GGMLQNN_LOG_INFO("enter %s\n", __func__);
+    size_t dev_index = 0;
+
+    //case-1: special scenario, such as test-backend-ops or other similar scenarios: calling ggml_backend_qnn_device_init_backend directly in the user's application
+    //call ggmlqnn_load_cfg accordingly in this place
+    ggmlqnn_load_cfg();
+    GGMLQNN_LOG_INFO("user's specified qnn_backend in cfgfile = %d", g_qnn_params.qnn_backend);
+    GGMLQNN_LOG_INFO("user's specified qnn runtime lib path in cfgfile = %s", g_qnn_params.qnn_runtimelib_path);
+
     if (nullptr == params) {
-        params = 0;
+        GGMLQNN_LOG_INFO("program specified param is nullptr\n");
+        dev_index = (g_qnn_params.qnn_backend > 0) ? g_qnn_params.qnn_backend : 0;
+        if (dev_index >= GGML_QNN_MAX_DEVICES) {
+            GGMLQNN_LOG_INFO("assume the default ggml backend\n");
+            return nullptr;
+        }
+    } else {
+        GGMLQNN_LOG_INFO("program specified param is not nullptr\n");
+        //user's program calling ggml_backend_qnn_device_init_backend directly
+        dev_index = (int)(intptr_t)params;
+        g_qnn_params.qnn_backend = dev_index;
+        GGMLQNN_LOG_INFO("program specified dev_index %d\n", dev_index);
     }
-    ggml_backend_t qnn_backend = ggml_backend_qnn_init((int)(intptr_t)params, g_qnn_runtimelib_path);
+    GGMLQNN_LOG_INFO("qnn_backend=%d", dev_index);
+    ggml_backend_t qnn_backend = ggml_backend_qnn_init(dev_index, g_qnn_params.qnn_runtimelib_path);
+    GGMLQNN_LOG_INFO("leave %s\n", __func__);
 
     return qnn_backend;
 
@@ -3812,7 +4074,7 @@ static ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_host_ptr(ggml_b
 
 static bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
     ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context;
 
-    return (ggml_qnn_can_handle_op(ctx,op));
+    return (ggmlqnn_compute_can_handle_op(ctx,op));
 }
 
 static bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
@@ -3849,7 +4111,7 @@ static ggml_backend_i ggml_backend_qnn_interface = {
     /* .graph_plan_free      = */ nullptr,
     /* .graph_plan_update    = */ nullptr,
     /* .graph_plan_compute   = */ nullptr,
-    /* .graph_compute        = */ ggml_backend_qnn_graph_compute,
+    /* .graph_compute        = */ nullptr,
     /* .event_record         = */ nullptr,
     /* .event_wait           = */ nullptr,
 };
@@ -3909,7 +4171,6 @@ static void * ggml_backend_qnn_reg_get_proc_address(ggml_backend_reg_t reg, cons
         return nullptr;
 
     const char * slot_name = "ggml_backend_set_n_threads";
-    //avoid buffer attack rather than strcmp
     if (0 == memcmp(name, slot_name, strlen(slot_name))) {
         return (void *)ggml_backend_qnn_set_n_threads;
     }
@@ -3927,6 +4188,17 @@ ggml_backend_reg_t ggml_backend_qnn_reg() {
     static ggml_backend_reg reg;
     static bool initialized = false;
     GGMLQNN_LOG_DEBUG("enter ggml_backend_qnn_reg");
+    //case-2: normal scenario, such as llama-cli or a UI application
+    //call ggmlqnn_load_cfg accordingly in this place
+    ggmlqnn_load_cfg();
+    GGMLQNN_LOG_INFO("user's specified qnn_backend=%d", g_qnn_params.qnn_backend);
+    GGMLQNN_LOG_INFO("user's specified qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path);
+    if (g_qnn_params.qnn_backend >= GGML_QNN_MAX_DEVICES) {
+        GGMLQNN_LOG_INFO("assume default ggml backend\n");
+        GGMLQNN_LOG_DEBUG("leave ggml_backend_qnn_reg");
+        return nullptr;
+    }
+
     {
         static std::mutex mutex;
         std::lock_guard<std::mutex> lock(mutex);
@@ -3965,6 +4237,8 @@ ggml_backend_reg_t ggml_backend_qnn_reg() {
 
 ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
     int result = 0;
+    GGMLQNN_LOG_INFO("enter %s\n", __func__);
+
     if (nullptr ==
qnn_lib_path)
        return nullptr;
@@ -3976,14 +4250,14 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
     }
 
     if (nullptr != g_qnn_mgr[device].backend) {
-        GGMLQNN_LOG_WARN("qnn backend %d(%s) already loaded", device, ggml_backend_qnn_get_devname(device));
+        GGMLQNN_LOG_INFO("qnn backend %d(%s) already loaded", device, ggml_backend_qnn_get_devname(device));
+        GGMLQNN_LOG_INFO("leave %s\n", __func__);
         return g_qnn_mgr[device].backend;
     }
 
-    ggmlqnn_load_cfg();
-
 #if defined(__ANDROID__)
     std::string path = qnn_lib_path;
+    GGMLQNN_LOG_INFO("lib_path %s", path.c_str());
     if (QNN_BACKEND_NPU == device) {
         if (0 == setenv("LD_LIBRARY_PATH",
                         (path +
@@ -4034,13 +4308,21 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) {
     g_qnn_mgr[device].raw_interface        = instance->get_qnn_raw_interface();
     g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface();
 
+    if (0 == g_qnn_params.inference_approach) {
+        ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_general;
+    } else {
+        ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_special;
+    }
+
     ggml_backend_t qnn_backend = new ggml_backend{
         /* .guid      = */ ggml_backend_qnn_guid(),
         /* .iface     = */ ggml_backend_qnn_interface,
         /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_qnn_reg(), device),
         /* .context   = */ &g_qnn_mgr[device]
     };
+
     g_qnn_mgr[device].backend = qnn_backend;
+    GGMLQNN_LOG_INFO("leave %s\n", __func__);
 
     return qnn_backend;
 }
@@ -4065,7 +4347,7 @@ static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor)
 
 static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
                                            const ggml_tensor * src1, ggml_tensor * dst) {
-    if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
+    if ((nullptr == ctx) || (nullptr == src0) || (nullptr == dst)) {
         GGMLQNN_LOG_WARN("invalid params\n");
         return false;
     }
@@ -4080,10 +4362,10 @@ static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const
 }
 
 /*
- * provide a general skeleton to offload ggml op to QNN backend: a single node contains 2 input
- * tensor and 1 output tensor
+ * provide a general skeleton to offload a ggml op to the QNN backend: perform an element-wise operation on 1 or 2
+ * input tensors and 1 output tensor
 */
-void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
+static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
     qnn_instance * instance = nullptr;
     Qnn_GraphHandle_t graph_handle = nullptr;
@@ -4098,51 +4380,62 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
     instance = ctx->instance;
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
     size_t qnn_op_index = ggmlqnn_get_op_index(op);
-    GGML_ASSERT(qnn_op_index < ggmlqnn_get_opcaps_size());
     const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name;
+    size_t input_param_count = ggmlqnn_k_op_caps[qnn_op_index].input_param_count;
     std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op);
     const char * ggml_op_name = ggml_op_name_string.c_str();
 
-    qnn_perf op_perf = qnn_perf(ggml_op_name);
-    op_perf.start();
-
-    //ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst);
     bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU;
 
     std::string graph_name;
     ggmlqnn_get_graphkey_from_op(op, graph_name);
-    if
(instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + + qnn_perf op_perf = qnn_perf(graph_name); + op_perf.start(); + + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { //retrieve computational resource from cached QNN graph - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensor = std::get<1>(graph_item); - p_tensor0 = tensor[0]; - p_tensor1 = tensor[1]; - p_tensor2 = tensor[2]; + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & ptensors = std::get<1>(graph_item); + p_tensor0 = ptensors[0]; + if (2 == input_param_count) { + p_tensor1 = ptensors[1]; + p_tensor2 = ptensors[2]; + } else { + //now p_tensor1 is nullptr + p_tensor2 = ptensors[1]; + } } else { GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); GGML_ASSERT(instance->get_device_id() == ctx->device); //create QNN graph - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8, 4); + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), g_qnn_params.vtcm_size_in_mb, g_qnn_params.hvx_threads); if (QNN_SUCCESS != error) { GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; } graph_handle = instance->get_qnn_graph_handle(); + GGMLQNN_LOG_DEBUG("graph_handle %p", graph_handle); //create computational tensor p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE); - p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); + if (2 == input_param_count) { + p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); + } p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); //compose QNN graph - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { + qnn_tensors_t input_tensors; + input_tensors.reserve(input_param_count); + input_tensors.push_back(*p_tensor0); + if (2 == input_param_count) { + input_tensors.push_back(*p_tensor1); + } + Qnn_Tensor_t output_tensors[] = { *p_tensor2 }; +#if 0 // keep them for understand code easily Qnn_OpConfig_t op_config = { QNN_OPCONFIG_VERSION_1, { ggml_op_name, @@ -4150,86 +4443,98 @@ void ggml_qnn_general_node(ggml_backend_qnn_context * ctx, ggml_tensor * op) { qnn_op_name, 0, nullptr, - 2, + input_param_count, tensor_inputs, 1, tensor_outputs } }; +#else + Qnn_OpConfig_t op_config = ggmlqnn_create_op_config(ggml_op_name, QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, nullptr, 0, + input_tensors.data(), input_param_count, output_tensors, 1); +#endif CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); //finalize QNN graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); //cache QNN graph - qnn_tensors_t ggml_op_add_tensors; - ggml_op_add_tensors.reserve(3); - ggml_op_add_tensors.push_back(p_tensor0); - ggml_op_add_tensors.push_back(p_tensor1); - ggml_op_add_tensors.push_back(p_tensor2); - auto graph_item = std::make_tuple(graph_handle, ggml_op_add_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; + qnn_ptensors_t qnn_elementwise_tensors; + qnn_elementwise_tensors.reserve(input_param_count + 1); + + qnn_elementwise_tensors.push_back(p_tensor0); + if (2 == input_param_count) { + 
qnn_elementwise_tensors.push_back(p_tensor1); + } + qnn_elementwise_tensors.push_back(p_tensor2); + auto graph_item = std::make_tuple(graph_handle, qnn_elementwise_tensors); + ctx->qnn_singlenode_graph_map[graph_name] = graph_item; } if (enable_npu_rpc) { uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); + GGMLQNN_LOG_DEBUG("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); if (nullptr != qnn_buffer_0) { memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); } - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor1)->memHandle)); - GGMLQNN_LOG_INFO("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); - if (nullptr != qnn_buffer_1) { - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + if (2 == input_param_count) { + uint8_t *qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*p_tensor1)->memHandle)); + GGMLQNN_LOG_DEBUG("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } } } else { QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + if (2 == input_param_count) { + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + } QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; } - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { + qnn_tensors_t input_tensors; + input_tensors.reserve(input_param_count); + input_tensors.push_back(*p_tensor0); + if (2 == input_param_count) { + input_tensors.push_back(*p_tensor1); + } + Qnn_Tensor_t output_tensors[] = { *p_tensor2 }; CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, + input_tensors.data(), input_param_count, + output_tensors, 1, nullptr, nullptr)); if (enable_npu_rpc) { - //TODO:NPU RPC feature will failed with test-backend-ops uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); if (nullptr != qnn_buffer_2) { memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); } } -#if GGMLQNN_PRINT_OP_ADD_LOG op_perf.info(); -#endif } /* * this function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend * various UT has verified and succeed but failed in CT of test-backend-ops * - * the logic of ggml_qnn_mul_mat_4d is similar to ggml_qnn_mul_mat but much more complicated - * than ggml_qnn_mul_mat, so it's a standalone function. - * it will be combined with ggml_qnn_mul_mat in the future + * the logic of ggmlqnn_compute_mul_mat_4d is similar to ggmlqnn_compute_mul_mat but much more complicated + * than ggmlqnn_compute_mul_mat, so it's a standalone function. 
+ * it will be combined with ggmlqnn_compute_mul_mat in the future */ -static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat_4d"); - qnn_instance *instance = ctx->instance; +static void ggmlqnn_compute_mul_mat_4d(ggml_backend_qnn_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_perf op_perf = qnn_perf("ggmlqnn_compute_mul_mat_4d"); + qnn_instance * instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - const ggml_tensor *src0 = op->src[0]; - const ggml_tensor *src1 = op->src[1]; - ggml_tensor *dst = op; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); @@ -4241,32 +4546,31 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t *p_tensor0 = nullptr; - Qnn_Tensor_t *p_reshape0_out = nullptr; - Qnn_Tensor_t *p_tile0_out = nullptr; - Qnn_Tensor_t *p_tensor1 = nullptr; - Qnn_Tensor_t *p_permute1_out = nullptr; - Qnn_Tensor_t *p_reshape1_out = nullptr; - Qnn_Tensor_t *p_matmul_out = nullptr; - Qnn_Tensor_t *p_reshape2_out = nullptr; - - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_reshape0_out = nullptr; + Qnn_Tensor_t * p_tile0_out = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_permute1_out = nullptr; + Qnn_Tensor_t * p_reshape1_out = nullptr; + Qnn_Tensor_t * p_matmul_out = nullptr; + Qnn_Tensor_t * p_reshape2_out = nullptr; + + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { graph_initialized = true; - qnn_res_t &graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_tensors_t &tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_reshape0_out = tensors[1]; - p_tile0_out = tensors[2]; - p_tensor1 = tensors[3]; - p_permute1_out = tensors[4]; - p_reshape1_out = tensors[5]; - p_matmul_out = tensors[6]; - p_reshape2_out = tensors[7]; + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_reshape0_out = tensors[1]; + p_tile0_out = tensors[2]; + p_tensor1 = tensors[3]; + p_permute1_out = tensors[4]; + p_reshape1_out = tensors[5]; + p_matmul_out = tensors[6]; + p_reshape2_out = tensors[7]; } else { - CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), - graph_name.c_str(), NULL, &graph_handle)); + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); // Define dimensions uint32_t K = src0->ne[0]; // Inner dimension @@ -4279,127 +4583,136 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] - uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), static_cast(src0->ne[1]), 
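The dimension arrays handed to QNN list the slowest-varying ggml dimension first, which is why src0_dims above is built from ne[3] down to ne[0]. A tiny self-contained sketch of that reordering; the helper name is made up.

    #include <cstdint>
    #include <cstdio>

    // build a QNN-style dims array (outermost dimension first) from a ggml ne[] array,
    // which stores the innermost (fastest-varying) dimension at index 0
    static void ggml_ne_to_qnn_dims(const int64_t * ne, int rank, uint32_t * dims) {
        for (int i = 0; i < rank; i++) {
            dims[i] = (uint32_t) ne[rank - 1 - i];
        }
    }

    int main() {
        int64_t  ne[4]   = {64, 16, 2, 1};   // ggml order: [K, M, H, B]
        uint32_t dims[4] = {0};
        ggml_ne_to_qnn_dims(ne, 4, dims);
        printf("qnn dims: [%u, %u, %u, %u]\n", dims[0], dims[1], dims[2], dims[3]);  // [B, H, M, K]
        return 0;
    }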
static_cast(src0->ne[0])}; - p_tensor0 = GQCGT(src0, "input0", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src0_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), + static_cast(src0->ne[1]), static_cast(src0->ne[0]) + }; + p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, "input0", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src0_dims, nullptr, 0); // Reshape src0 to [B0, M, K] uint32_t reshape0_out_dims[] = {B0, M, K}; - p_reshape0_out = GQCGT(nullptr, "reshape0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - reshape0_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape0_out)); - Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; + p_reshape0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape0_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; - Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape0_inputs, 1, reshape0_outputs, 1); + Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape0_inputs, 1, reshape0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); // Tile src0 to match B1: [B0, M, K] -> [B1, M, K] uint32_t tile0_out_dims[] = {B1, M, K}; - p_tile0_out = GQCGT(nullptr, "tile0_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - tile0_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile0_out)); + p_tile0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + tile0_out_dims, nullptr, 0); + uint32_t tile_multiples[] = {B1 / B0, 1, 1}; uint32_t tile_dims[] = {3}; - Qnn_Tensor_t *p_tile_multiples = GQCGT(nullptr, "tile_multiples", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - tile_dims, tile_multiples, sizeof(tile_multiples)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tile_multiples)); - Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; - Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; - Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; - Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TILE, tile_params, 1, - tile0_inputs, 1, tile0_outputs, 1); + Qnn_Tensor_t * p_tile_multiples = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile_multiples", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + tile_dims, tile_multiples, sizeof(tile_multiples)); + + Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; + Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; + Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; + Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TILE, tile_params, 1, + tile0_inputs, 1, tile0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] - uint32_t src1_dims[] = 
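The Tile node broadcasts src0's batch to src1's batch by repeating it B1/B0 times along the leading axis, so B1 must be a multiple of B0. A small host-side sketch of what that tiling does to the data; it is illustrative only, since the real work happens on-device inside the QNN graph.

    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // replicate a [B0, M, K] buffer along the batch axis so it becomes [B1, M, K];
    // mirrors a QNN Tile node with multiples {B1/B0, 1, 1}, which concatenates whole copies
    static std::vector<float> tile_batch(const std::vector<float> & src,
                                         uint32_t B0, uint32_t M, uint32_t K, uint32_t B1) {
        assert(B1 % B0 == 0 && "batch of src1 must be a multiple of batch of src0");
        const uint32_t repeat = B1 / B0;
        std::vector<float> out;
        out.reserve((size_t) B1 * M * K);
        for (uint32_t r = 0; r < repeat; r++) {
            out.insert(out.end(), src.begin(), src.end());
        }
        return out;
    }

    int main() {
        uint32_t B0 = 1, M = 2, K = 3, B1 = 4;
        std::vector<float> src((size_t) B0 * M * K, 1.0f);
        std::vector<float> tiled = tile_batch(src, B0, M, K, B1);
        printf("tiled elements: %zu\n", tiled.size());   // 24
        return 0;
    }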
{static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[1]), static_cast(src1->ne[0])}; - p_tensor1 = GQCGT(src1, "input1", QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src1_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[1]), static_cast(src1->ne[0]) + }; + p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, "input1", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src1_dims, nullptr, 0); + // Permute src1 to [B1, H1, K, N] uint32_t perm_data[] = {0, 1, 3, 2}; uint32_t perm_dims[] = {4}; - Qnn_Tensor_t *p_perm = GQCGT(nullptr, "perm", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - perm_dims, perm_data, sizeof(perm_data)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_perm)); - uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), static_cast(src1->ne[0]), static_cast(src1->ne[1])}; - p_permute1_out = GQCGT(nullptr, "permute1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, - permute1_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_permute1_out)); - Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; - Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; + Qnn_Tensor_t * p_perm = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "perm", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); + + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[0]), static_cast(src1->ne[1]) + }; + p_permute1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "permute1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + permute1_out_dims, nullptr, 0); + + Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; + Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; - Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, permute1_params, 1, - permute1_inputs, 1, permute1_outputs, 1); + Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, permute1_params, 1, + permute1_inputs, 1, permute1_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); // Reshape src1 to [B1, K, N] uint32_t reshape1_out_dims[] = {B1, K, N}; - p_reshape1_out = GQCGT(nullptr, "reshape1_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - reshape1_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape1_out)); - Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; + p_reshape1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape1_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; - Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape1_inputs, 1, reshape1_outputs, 1); + Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, + 
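The Transpose node uses perm = {0, 1, 3, 2}, i.e. it swaps the last two axes of src1 so that K lines up with the MatMul contraction axis. A sketch of how such a perm array maps output dims from input dims (out_dims[i] = in_dims[perm[i]]); the helper name is made up.

    #include <cstdint>
    #include <cstdio>

    // out_dims[i] = in_dims[perm[i]]: the output's i-th axis is the input's perm[i]-th axis,
    // which is how a transpose permutation parameter is interpreted
    static void apply_perm(const uint32_t * in_dims, const uint32_t * perm, int rank, uint32_t * out_dims) {
        for (int i = 0; i < rank; i++) {
            out_dims[i] = in_dims[perm[i]];
        }
    }

    int main() {
        uint32_t src1_dims[4] = {1, 4, 8, 64};   // QNN order: [B1, H1, N, K]
        uint32_t perm[4]      = {0, 1, 3, 2};    // swap the last two axes
        uint32_t out_dims[4]  = {0};
        apply_perm(src1_dims, perm, 4, out_dims);
        printf("[%u, %u, %u, %u]\n", out_dims[0], out_dims[1], out_dims[2], out_dims[3]);  // [B1, H1, K, N]
        return 0;
    }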
QNN_OP_RESHAPE, nullptr, 0, + reshape1_inputs, 1, reshape1_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] uint32_t matmul_out_dims[] = {B1, M, N}; - p_matmul_out = GQCGT(nullptr, "matmul_out", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - matmul_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_matmul_out)); - Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; - Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; - Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, nullptr, 0, - matmul_inputs, 2, matmul_outputs, 1); + p_matmul_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "matmul_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + matmul_out_dims, nullptr, 0); + + Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; + Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; + Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, nullptr, 0, + matmul_inputs, 2, matmul_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] - uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), static_cast(dst->ne[1]), static_cast(dst->ne[0])}; - p_reshape2_out = GQCGT(dst, "output", QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, - reshape2_out_dims, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_reshape2_out)); - Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; + uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), + static_cast(dst->ne[1]), static_cast(dst->ne[0]) + }; + p_reshape2_out = ggmlqnn_create_general_tensor(instance, graph_handle, dst, "output", + QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, + reshape2_out_dims, nullptr, 0); + + Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; - Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape2_inputs, 1, reshape2_outputs, 1); + Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape2_inputs, 1, reshape2_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); // Finalize CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); // Cache - qnn_tensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out}; - instance->_qnn_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + qnn_ptensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, + p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out + }; + ctx->qnn_singlenode_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); } // Execute - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, 
static_cast(ggml_nbytes(src1))}; QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; - Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, - output_tensors, 1, NULL, NULL)); - -#if 0 - // Log dst for debugging - float *dst_data = (float *)dst->data; - GGMLQNN_LOG_DEBUG("dst shape: [%d, %d, %d, %d]\n", dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3]); - for (int i = 0; i < dst->ne[0] * dst->ne[1] * dst->ne[2] * dst->ne[3]; i++) { - GGMLQNN_LOG_DEBUG("dst[%d] = %f\n", i, dst_data[i]); - } -#endif + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, 1, NULL, NULL)); op_perf.info(); } @@ -4431,18 +4744,13 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose operation when offloading mulmat to QNN backend. this implementation will handle transpose - in func ggml_qnn_create_general_tensor() - * - * this function is a good example to illustrated the second technical approach "mapping the - * entire ggml computational graph to QNN graph" without complex C++ encapsulation. or another - * pipeline of "how to utilize the Hexagon NPU maximally through QNN SDK", details could be found at - * https://github.com/ggml-org/llama.cpp/pull/12049#issuecomment-2678308360 - * + in func ggmlqnn_compute_create_general_tensor() + * @param ctx the context of ggml-qnn backend * @param op the destination tensor where the result of the matrix multiplication will be stored. * - * @note the logic of ggml_qnn_mul_mat is similar to ggml_qnn_general_node but much more complicated - * than ggml_qnn_general_node. so it's a standalone function. accordingly, this is another + * @note the logic of ggmlqnn_compute_mul_mat is similar to ggmlqnn_compute_op_two_tensors but much more complicated + * than ggmlqnn_compute_op_two_tensors. so it's a standalone function. accordingly, this is another * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds * of MUL_MAT to compute: @@ -4451,9 +4759,8 @@ static void ggml_qnn_mul_mat_4d(ggml_backend_qnn_context *ctx, ggml_tensor *op) * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) 
* and src1 is F32, src0 -> f32 in src0', then src0' * src1 */ -void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { +static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - qnn_perf op_perf = qnn_perf("ggml_qnn_mul_mat"); qnn_instance * instance = nullptr; Qnn_GraphHandle_t graph_handle = nullptr; Qnn_Tensor_t * p_tensor0 = nullptr; @@ -4468,7 +4775,7 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); + const enum ggml_type src0_type = src0->type; const uint32_t src0_rank = ggml_n_dims(src0); @@ -4476,20 +4783,25 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { GGML_ASSERT(src0_rank == src1_rank); GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy if (4 == src0_rank) { - return ggml_qnn_mul_mat_4d(ctx, op); + return ggmlqnn_compute_mul_mat_4d(ctx, op); } - void * wdata = ggmlqnn_type_trait(ctx, op); - const size_t desired_size = ctx->desired_size; ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); std::string graph_name; ggmlqnn_get_graphkey_from_op(op, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + + qnn_perf op_perf = qnn_perf(graph_name); + op_perf.start(); + + void * wdata = ggmlqnn_type_trait(ctx, op); + const size_t desired_size = ctx->desired_size; + + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { //retrieve computational resource from cached QNN graph - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; graph_handle = std::get<0>(graph_item); - qnn_tensors_t & tensors = std::get<1>(graph_item); + qnn_ptensors_t & tensors = std::get<1>(graph_item); p_tensor0 = tensors[0]; p_tensor1 = tensors[1]; p_tensor2 = tensors[2]; @@ -4498,7 +4810,7 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { } else { //create QNN graph GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8, 4); + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), g_qnn_params.vtcm_size_in_mb, g_qnn_params.hvx_threads); if (QNN_SUCCESS != error) { GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; @@ -4506,12 +4818,15 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { graph_handle = instance->get_qnn_graph_handle(); //create computational tensor - p_tensor0 = GQCGT(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor1 = GQCGT(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - p_tensor2 = GQCGT(dst, nullptr, QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor0)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor1)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2)); + p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, nullptr, + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 
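The comment above describes the layout gap between ggml's MUL_MAT and QNN's MatMul: ggml expects dst(m, n) to be the dot product of row m of src0 with row n of src1, which this patch realizes as MatMul(src0, src1, transpose_in1 = true) followed by a Transpose node. A small CPU reference of that convention, with made-up 2x2 data, purely for explanation.

    #include <cstdio>
    #include <vector>

    // reference for ggml's GGML_OP_MUL_MAT convention on contiguous 2D f32 tensors:
    // src0 has ne = [K, M], src1 has ne = [K, N], dst has ne = [M, N],
    // and dst(m, n) is the dot product of row m of src0 with row n of src1
    static std::vector<float> ggml_style_mul_mat(const std::vector<float> & src0,  // K * M values
                                                 const std::vector<float> & src1,  // K * N values
                                                 int K, int M, int N) {
        std::vector<float> dst((size_t) M * N, 0.0f);
        for (int n = 0; n < N; n++) {
            for (int m = 0; m < M; m++) {
                float sum = 0.0f;
                for (int k = 0; k < K; k++) {
                    sum += src0[(size_t) m * K + k] * src1[(size_t) n * K + k];
                }
                dst[(size_t) n * M + m] = sum;   // ne[0] of dst is M, so m varies fastest
            }
        }
        return dst;
    }

    int main() {
        int K = 2, M = 2, N = 2;
        std::vector<float> src0 = {1, 2, 3, 4};   // rows: {1,2}, {3,4}
        std::vector<float> src1 = {1, 0, 0, 1};   // rows: {1,0}, {0,1}
        std::vector<float> dst  = ggml_style_mul_mat(src0, src1, K, M, N);
        printf("%.0f %.0f %.0f %.0f\n", dst[0], dst[1], dst[2], dst[3]);   // 1 3 2 4
        return 0;
    }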
src0_rank, + nullptr, nullptr, 0); + p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, nullptr, + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); + p_tensor2 = ggmlqnn_create_general_tensor(instance, graph_handle, dst, nullptr, + QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); //create param tensor for offload 2d/3d/4d matrix multiplication const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { @@ -4521,32 +4836,36 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { {0, 1, 3, 2}, }; uint32_t param_tensor_dims[1] = {src0_rank}; - p_param_tensor = GQCGT(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor)); + p_param_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "param", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, + (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); //create transpose tensor - p_tensor2_transpose = GQCGT(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0, true); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_tensor2_transpose)); + p_tensor2_transpose = ggmlqnn_create_general_tensor(instance, graph_handle, dst, "transpose", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0, true); //compose QNN graph: add mulmat node Qnn_Param_t out_0_params[] = {{QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = {QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; - Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_MAT_MUL, out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); + Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); //compose QNN graph: add transpose node Qnn_Param_t out_trans1_0_params[] = { {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; - Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, QNN_OP_TRANSPOSE, out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); + Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); //finalize QNN graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); //cache QNN graph - qnn_tensors_t ggml_op_mulmat_tensors; + qnn_ptensors_t ggml_op_mulmat_tensors; ggml_op_mulmat_tensors.reserve(5); ggml_op_mulmat_tensors.push_back(p_tensor0); ggml_op_mulmat_tensors.push_back(p_tensor1); @@ -4554,7 +4873,7 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { 
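For the mul_mat_q_f32 case listed above, src0 has to be converted to f32 before upload; in this patch ggmlqnn_type_trait() prepares that scratch buffer (wdata). The sketch below shows the row-wise dequantization idea using ggml's public type traits; it assumes ggml_get_type_traits()->to_float is available and src0 is contiguous, and it is not the patch's exact implementation.

    #include <cstring>
    #include <vector>
    #include "ggml.h"

    // convert a contiguous (possibly quantized) ggml tensor to f32, row by row, so the
    // result can be uploaded as a QNN_DATATYPE_FLOAT_32 tensor; this only mirrors the
    // idea behind ggmlqnn_type_trait()/wdata, not its buffer management
    static std::vector<float> dequantize_to_f32(const struct ggml_tensor * t) {
        GGML_ASSERT(ggml_is_contiguous(t));
        const int64_t nrows = ggml_nrows(t);
        const int64_t ne0   = t->ne[0];
        std::vector<float> out((size_t) (nrows * ne0));

        if (t->type == GGML_TYPE_F32) {
            memcpy(out.data(), t->data, ggml_nbytes(t));   // nothing to convert
            return out;
        }
        const ggml_type_traits * traits = ggml_get_type_traits(t->type);
        const size_t             row_sz = ggml_row_size(t->type, ne0);
        for (int64_t r = 0; r < nrows; r++) {
            const char * src_row = (const char *) t->data + r * row_sz;
            traits->to_float(src_row, out.data() + r * ne0, ne0);   // per-row dequantization
        }
        return out;
    }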
ggml_op_mulmat_tensors.push_back(p_param_tensor); ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; + ctx->qnn_singlenode_graph_map[graph_name] = graph_item; } if (src0_type != GGML_TYPE_F32) { @@ -4579,127 +4898,127 @@ void ggml_qnn_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { op_perf.info(); } -void ggml_qnn_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * 
dst) { +static void ggmlqnn_compute_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { +static void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { GGML_UNUSED(ctx); GGML_UNUSED(dst); GGML_UNUSED(value); } -void ggml_qnn_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - ggml_qnn_dup(ctx, dst); +static void ggmlqnn_compute_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { + ggmlqnn_compute_dup(ctx, dst); } -void ggml_qnn_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } -void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { +static void ggmlqnn_compute_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); } @@ -4707,13 +5026,14 @@ void ggml_qnn_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { // ================================================================================================= // section-10: second approach: mapping ggml computational cgraph to QNN graph // ================================================================================================= +// TODO: remove duplicated codes between section-9 and section-10 +// TODO: the graph algorithm in this section is naive, should optimized by AI experts // details: https://github.com/ggml-org/llama.cpp/pull/12326#issuecomment-2712838649 // ref: https://github.com/kantv-ai/kantv/blob/kantv-poc-with-qnn/core/ggml/jni/Inception_v3.cpp#L20634 -// TODO: mapping entire ggml cgraph to a single QNN graph -static enum ggml_status ggmlqnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static enum ggml_status ggmlqnn_backend_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status ggml_result = GGML_STATUS_SUCCESS; Qnn_ErrorHandle_t qnn_error = QNN_SUCCESS; - qnn_perf op_perf = qnn_perf("ggmlqnn_graph_compute"); + qnn_perf op_perf = qnn_perf("ggml_backend_qnn_graph_compute_special"); qnn_instance * instance = nullptr; Qnn_GraphHandle_t graph_handle = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; @@ -4721,7 +5041,7 @@ static enum ggml_status ggmlqnn_graph_compute(ggml_backend_t backend, struct ggm QNN_INTERFACE_VER_TYPE 
qnn_raw_interface = ctx->raw_interface; op_perf.start(); - //now we got the entire ggml cgraph + //now we got the entire ggml cgraph or a ggml cgraph which contains multiple nodes GGMLQNN_LOG_DEBUG("qnn device %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device)); GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); int num_nodes = std::min(5, cgraph->n_nodes); @@ -4731,13 +5051,16 @@ static enum ggml_status ggmlqnn_graph_compute(ggml_backend_t backend, struct ggm GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } - //now we'll offload the entire ggml cgraph to a single opcfg QNN graph + //now we'll offload the ggml cgraph to a single QNN graph std::string graph_name; ggmlqnn_get_graphkey_from_cgraph(cgraph, graph_name); - if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) { + if (graph_name == "") + return GGML_STATUS_SUCCESS; + if (ctx->qnn_multinode_graph_map.find(graph_name) != ctx->qnn_multinode_graph_map.end()) { + GGMLQNN_LOG_DEBUG("graph name %s already create", graph_name.c_str()); //retrieve computational resource from cached QNN graph - qnn_res_t & graph_item = instance->_qnn_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); + qnn_multinode_res_t &graph_res = ctx->qnn_multinode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_res); } else { //create QNN graph GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); @@ -4748,20 +5071,14 @@ static enum ggml_status ggmlqnn_graph_compute(ggml_backend_t backend, struct ggm return ggml_result; } graph_handle = instance->get_qnn_graph_handle(); - //TODO: compose a single opcfg QNN graph + //TBD: compose a single QNN graph - //TODO: finalize QNN graph - //CHECK_QNN_API(qnn_error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); + //finalize QNN graph + CHECK_QNN_API(qnn_error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - //cache QNN graph - qnn_tensors_t ggml_op_mulmat_tensors; - ggml_op_mulmat_tensors.reserve(0); - auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - instance->_qnn_graph_map[graph_name] = graph_item; + //TBD: cache QNN graph } - //exec QNN graph - - GGMLQNN_LOG_DEBUG("the second inference approach \"mapping cgraph to QNN graph\" is actually not supported now"); + //TBD: exec QNN graph return ggml_result; } diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 393f4d458f41b..6f117680071e5 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -7,7 +7,6 @@ PWD=`pwd` ANDROID_PLATFORM=android-34 ANDROID_NDK=${PWD}/android-ndk-r26c REMOTE_PATH=/data/local/tmp/ -GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf #QNN SDK could be found at: @@ -18,8 +17,7 @@ QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/ QNN_SDK_VERSION=2.32.0.250228 QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION} -#default is QNN NPU -qnnbackend=2 +qnnparams=" -mg 2 -ngl 99 " function dump_vars() { @@ -188,7 +186,7 @@ function run_llamacli() adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -ngl 99 -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + && ${REMOTE_PATH}/llama-cli ${qnnparams} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" } @@ -199,12 +197,11 @@ function run_llamabench() adb shell "cd 
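The special approach caches one QNN graph per ggml cgraph, keyed by the string returned from ggmlqnn_get_graphkey_from_cgraph(); an empty key is treated as "nothing to offload" above. That function's body is not part of this hunk, so the sketch below is only one plausible way to derive such a key (op names plus shapes), not the patch's implementation.

    #include <cstdio>
    #include <string>
    #include "ggml.h"

    // derive a cache key from the ops and shapes in a cgraph so that two structurally
    // identical cgraphs map onto the same cached QNN graph; purely illustrative
    static std::string make_cgraph_key(struct ggml_cgraph * cgraph) {
        std::string key;
        char buf[128];
        for (int i = 0; i < ggml_graph_n_nodes(cgraph); i++) {
            const struct ggml_tensor * node = ggml_graph_node(cgraph, i);
            snprintf(buf, sizeof(buf), "%s_%dx%dx%dx%d_", ggml_op_name(node->op),
                     (int) node->ne[0], (int) node->ne[1], (int) node->ne[2], (int) node->ne[3]);
            key += buf;
        }
        return key;   // an empty key (no nodes) matches the early-return case in the hunk above
    }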
${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}" + && ${REMOTE_PATH}/llama-bench ${qnnparams} -m ${GGUF_MODEL_NAME}" } -#refer to:https://github.com/ggml-org/llama.cpp/pull/12155 function run_test-ops() { prepare_run_on_phone test-backend-ops @@ -215,37 +212,6 @@ function run_test-ops() } -function run_test-op() -{ - prepare_run_on_phone test-backend-ops - - qnnbackendname=qnn-cpu - case $qnnbackend in - 0) - qnnbackendname=qnn-cpu - ;; - 1) - qnnbackendname=qnn-gpu - ;; - 2) - qnnbackendname=qnn-npu - ;; - *) - qnnbackendname=qnn-cpu - ;; - esac - - #debug - echo "adb shell cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname " - - echo "\n" - adb shell "cd ${REMOTE_PATH} \ - && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname " - -} function print_oplist() { @@ -335,9 +301,8 @@ function show_usage() echo " $0 build" echo " $0 updateqnnlib" echo " $0 run_testops" - echo " $0 run_testop [ADD/MUL/MUL_MAT......(op from print_oplist)] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]" - echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" - echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)" + echo " $0 run_llamacli" + echo " $0 run_llamabench" echo -e "\n\n\n" } @@ -367,40 +332,19 @@ elif [ $# == 1 ]; then elif [ "$1" == "run_testops" ]; then run_test-ops exit 0 - - elif [ "$1" == "updateqnnlib" ]; then - update_qnn_libs - exit 0 - else - show_usage - exit 1 - fi -elif [ $# == 2 ]; then - qnnbackend=$2 - if [ ${qnnbackend} -gt 3 ]; then - show_usage - exit 1 - fi - - if [ "$1" == "run_llamacli" ]; then + elif [ "$1" == "run_llamacli" ]; then run_llamacli exit 0 elif [ "$1" == "run_llamabench" ]; then run_llamabench exit 0 - fi -elif [ $# == 3 ]; then - opname=$2 -#TODO: check opname in oplist -#opname can be found via print_oplist: - - qnnbackend=$3 - if [ ${qnnbackend} -gt 3 ]; then + elif [ "$1" == "updateqnnlib" ]; then + update_qnn_libs + exit 0 + else show_usage exit 1 fi - run_test-op - exit 0 else show_usage exit 1 diff --git a/scripts/ggml-qnn.cfg b/scripts/ggml-qnn.cfg index 5796e613ff2af..b1a697ae12ed9 100644 --- a/scripts/ggml-qnn.cfg +++ b/scripts/ggml-qnn.cfg @@ -1,9 +1,28 @@ [general] +#0: QNN-CPU backend +#1: QNN-GPU backend +#2: QNN-NPU(htp) backend +#3: default ggml backend +qnn_backend = 2 + # enable/disable QNN's internal log print_qnn_internal_log = 0 + +# enable/disable perf of op function +enable_perf = 0 + +# enable/disable print tensors info in op function +print_tensors_info = 0 + +# enable/disable dump op info in handle_op +dump_op_info = 0 + # 0: general approach,similar to ggml-sycl or ggml-cann # 1: mapping entire ggml cgraph to QNN graph inference_approach = 0 [npu] -npu_inference_datatype = "fp16" +hvx_threads = 4 +vtcm_size_in_mb = 8 +enable_dlbc = 1 +precision_mode = "fp16" From 7008c61e1b481b37e1cafb698cb46a753da11fe9 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 18 Mar 2025 22:45:26 +0800 Subject: [PATCH 126/200] ggml-qnn: self code-review --- ggml/src/ggml-qnn/ggml-qnn.cpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 834af1e08e30f..2e6281379d1b4 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -2547,7 +2547,7 @@ int 
qnn_instance::unload_system() { return result; } -static void ggmlqnn_compute_logcallback(const char * fmt, +static void ggmlqnn_sdk_logcallback(const char * fmt, QnnLog_Level_t level, uint64_t timestamp, va_list argp) { @@ -2556,7 +2556,7 @@ static void ggmlqnn_compute_logcallback(const char * fmt, return; static std::mutex log_mutex; - static unsigned char s_ggmlqnn_compute_logbuf[GGML_QNN_LOGBUF_LEN]; + static unsigned char s_ggmlqnn_sdk_logbuf[GGML_QNN_LOGBUF_LEN]; const char * log_level_desc = ""; switch (level) { @@ -2583,9 +2583,9 @@ static void ggmlqnn_compute_logcallback(const char * fmt, double ms = (double) timestamp / 1000000.0; { std::lock_guard lock(log_mutex); - memset(s_ggmlqnn_compute_logbuf, 0, GGML_QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggmlqnn_compute_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - GGMLQNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_compute_logbuf); + memset(s_ggmlqnn_sdk_logbuf, 0, GGML_QNN_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggmlqnn_sdk_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); + GGMLQNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_sdk_logbuf); } } @@ -2625,9 +2625,9 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _qnn_interface.set_qnn_interface(_loaded_backend); #if 1 - _qnn_interface.qnn_log_create(ggmlqnn_compute_logcallback, _qnn_log_level, &_qnn_log_handle); + _qnn_interface.qnn_log_create(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); #else - _qnn_raw_interface.logCreate(ggmlqnn_compute_logcallback, _qnn_log_level, &_qnn_log_handle); + _qnn_raw_interface.logCreate(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); #endif if (nullptr == _qnn_log_handle) { GGMLQNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone @@ -3476,7 +3476,7 @@ static bool ggmlqnn_same_types(const ggml_backend_qnn_context * ctx, const ggml_ return true; } -static bool ggmlqnn_compute_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { +static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { if (op_tensor->op == GGML_OP_NONE) { return true; } @@ -3567,7 +3567,7 @@ static bool ggmlqnn_compute_can_handle_op(const ggml_backend_qnn_context * ctx, } } -static bool ggmlqnn_compute_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { +static bool ggmlqnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { ggmlqnn_op_func_t func = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; @@ -3913,7 +3913,7 @@ static enum ggml_status ggmlqnn_backend_graph_compute_general(ggml_backend_t bac || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; } - bool ok = ggmlqnn_compute_compute_forward(backend, node); + bool ok = ggmlqnn_compute_forward(backend, node); if (!ok) { GGMLQNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -4074,7 +4074,7 @@ static ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_host_ptr(ggml_b static bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; - return (ggmlqnn_compute_can_handle_op(ctx,op)); + return (ggmlqnn_can_handle_op(ctx,op)); } static bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { @@ -5033,7 
+5033,7 @@ static void ggmlqnn_compute_rope(ggml_backend_qnn_context * ctx, ggml_tensor * d static enum ggml_status ggmlqnn_backend_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status ggml_result = GGML_STATUS_SUCCESS; Qnn_ErrorHandle_t qnn_error = QNN_SUCCESS; - qnn_perf op_perf = qnn_perf("ggml_backend_qnn_graph_compute_special"); + qnn_perf op_perf = qnn_perf("ggmlqnn_backend_graph_compute_special"); qnn_instance * instance = nullptr; Qnn_GraphHandle_t graph_handle = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; From 4bb6f0ad41f125db1fc3e1b17b6cc05baf06abe6 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 19 Mar 2025 20:35:16 +0800 Subject: [PATCH 127/200] ggml-qnn: rebase upstream --- ggml/src/ggml-qnn/ggml-qnn.cpp | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 2e6281379d1b4..8e98a042df93b 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -3299,6 +3299,12 @@ static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_t } static void ggmlqnn_load_cfg() { + //this function can be called in various scenarios + static bool initialized = false; + if (initialized) { + GGMLQNN_LOG_DEBUG("qnn cfg file already loadded\n"); + return; + } char time_string[GGML_QNN_TMPBUF_LEN]; memset(time_string, 0, GGML_QNN_TMPBUF_LEN); ggmlqnn_get_timestring(time_string); @@ -3333,6 +3339,7 @@ static void ggmlqnn_load_cfg() { } else { g_qnn_params.precision_mode = 0; } + initialized = true; } static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, @@ -4008,8 +4015,7 @@ static ggml_backend_t ggml_backend_qnn_device_init_backend(ggml_backend_dev_t de GGMLQNN_LOG_INFO("enter %s\n", __func__); size_t dev_index = 0; - //case-1: special scenario, such as test-backend-ops or other similar scenairo: calling ggml_backend_qnn_device_init_backend directly in user's applicaton - //call ggmlqnn_load_cfg accordingly in this place + //case-1: test-backend-ops or other similar scenairo: calling ggml_backend_dev_init(dev, reinterpret_cast(i)) directly in user's code ggmlqnn_load_cfg(); GGMLQNN_LOG_INFO("user's specified qnn_backend in cfgfile = %d", g_qnn_params.qnn_backend); GGMLQNN_LOG_INFO("user's sepcified qnn runtime lib path in cfgfile = %s", g_qnn_params.qnn_runtimelib_path); @@ -4188,8 +4194,8 @@ ggml_backend_reg_t ggml_backend_qnn_reg() { static ggml_backend_reg reg; static bool initialized = false; GGMLQNN_LOG_DEBUG("enter ggml_backend_qnn_reg"); + //case-2: normal scenario, such as llama-cli or UI applicaton - //call ggmlqnn_load_cfg accordingly in this place ggmlqnn_load_cfg(); GGMLQNN_LOG_INFO("user's specified qnn_backend=%d", g_qnn_params.qnn_backend); GGMLQNN_LOG_INFO("user's sepcified qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); @@ -4238,6 +4244,8 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { int result = 0; GGMLQNN_LOG_INFO("enter %s\n", __func__); + //case-3: calling ggml_backend_qnn_init() directly in user's code + ggmlqnn_load_cfg(); if (nullptr == qnn_lib_path) return nullptr; From f5fcc0a361b27dc8c3d9189964e91a72e1785394 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 22 Mar 2025 22:44:41 +0800 Subject: [PATCH 128/200] ggml-qnn: add approach through Hexagon cDSP --- ggml/include/ggml-qnn.h | 17 +- ggml/src/ggml-qnn/CMakeLists.txt | 60 +- 
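ggmlqnn_load_cfg() is now guarded by a static flag because the backend can be entered through three paths (device init, backend registration, or a direct ggml_backend_qnn_init() call). A minimal run-once sketch of that pattern with illustrative names and the defaults from scripts/ggml-qnn.cfg; like the patch it relies on a plain flag rather than a thread-safe singleton.

    #include <cstdio>

    // defaults mirroring scripts/ggml-qnn.cfg; the struct name is illustrative
    struct qnn_cfg {
        int qnn_backend     = 2;   // 2 = QNN-NPU(htp) backend
        int enable_perf     = 0;
        int hvx_threads     = 4;
        int vtcm_size_in_mb = 8;
    };
    static qnn_cfg g_cfg;

    // safe to call from every entry point; only the first call does real work
    static void load_cfg_once(void) {
        static bool initialized = false;
        if (initialized) {
            printf("cfg already loaded\n");
            return;
        }
        // the real code parses the cfg file here and fills g_qnn_params
        initialized = true;
    }

    int main() {
        load_cfg_once();   // e.g. reached from backend registration
        load_cfg_once();   // e.g. reached again from ggml_backend_qnn_init(); now a no-op
        printf("qnn_backend=%d hvx_threads=%d vtcm=%dMB\n",
               g_cfg.qnn_backend, g_cfg.hvx_threads, g_cfg.vtcm_size_in_mb);
        return 0;
    }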
ggml/src/ggml-qnn/ggml-qnn.cpp | 1324 ++++++++++++++++--- ggml/src/ggml-qnn/kernels/ggmlop.h | 289 ++++ ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c | 237 ++++ ggml/src/ggml-qnn/kernels/ggmlop_stub.c | 437 ++++++ ggml/src/ggml-qnn/kernels/libggmlop_skel.so | Bin 0 -> 13896 bytes scripts/build-run-android.sh | 4 +- scripts/ggml-qnn.cfg | 11 +- 9 files changed, 2139 insertions(+), 240 deletions(-) create mode 100644 ggml/src/ggml-qnn/kernels/ggmlop.h create mode 100644 ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c create mode 100644 ggml/src/ggml-qnn/kernels/ggmlop_stub.c create mode 100755 ggml/src/ggml-qnn/kernels/libggmlop_skel.so diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 06f143546ad24..2ff2bef9dcf7d 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -42,26 +42,11 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char GGML_BACKEND_API bool ggml_backend_is_qnn(ggml_backend_t backend); -GGML_BACKEND_API void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int thread_counts); - GGML_BACKEND_API int ggml_backend_qnn_get_device_count(void); GGML_BACKEND_API ggml_backend_reg_t ggml_backend_qnn_reg(void); -inline const char * ggml_backend_qnn_get_devname(size_t dev_num) { - switch (dev_num) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } -} +const char * ggml_backend_qnn_get_devname(size_t dev_num); #ifdef __cplusplus } diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index fcbbc33a9b136..c63faca10e842 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -1,37 +1,59 @@ message(STATUS "Using QNN backend") message("CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}") +if(NOT DEFINED QNN_SDK_PATH) + message(FATAL_ERROR "QNN_SDK_PATH not defined") +endif() + +if(NOT DEFINED HEXAGON_SDK_PATH) + message(FATAL_ERROR "HEXAGON_SDK_PATH not defined") +endif() + +message("QNN_SDK_PATH: ${QNN_SDK_PATH}") +message("HEXAGON_SDK_PATH: ${HEXAGON_SDK_PATH}") + if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) - set(QNN_LINK_LIBRARIES ${LOG_LIB}) + + add_library(cdsprpc + SHARED + IMPORTED) + set_target_properties(cdsprpc + PROPERTIES + IMPORTED_LOCATION + ${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so) + + set(QNN_LINK_LIBRARIES ${LOG_LIB} cdsprpc) set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") + + include_directories(${HEXAGON_SDK_PATH}/incs) + include_directories(${HEXAGON_SDK_PATH}/incs/stddef) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/incs) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_Debug_aarch64) + include_directories(${HEXAGON_SDK_PATH}/incs/qnx) + include_directories(${HEXAGON_SDK_PATH}/libs/common/qnx/ship/android_Debug_aarch64) + include_directories(${HEXAGON_SDK_PATH}/utils/examples) + include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/android_aarch64) + include_directories(${HEXAGON_SDK_PATH}/libs/atomic/inc) + include_directories(${HEXAGON_SDK_PATH}/libs/atomic/android_Debug_aarch64/ship) + include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-qnn/) + 
include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-qnn/kernels/) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") else() message(FATAL_ERROR "QNN now only available on Android and Windows(Windows on ARM)") endif() -if(NOT DEFINED GGML_QNN_SDK_PATH) -# try read from environment variable - if(DEFINED ENV{QNN_SDK_PATH}) - set(GGML_QNN_SDK_PATH $ENV{QNN_SDK_PATH}) - else() - message(FATAL_ERROR "GGML_QNN_SDK_PATH not defined") - endif() -endif() - -message("QNN_SDK_PATH: ${GGML_QNN_SDK_PATH}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_QNN") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") -file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp") - ggml_add_backend_library(ggml-qnn - ${QNN_SOURCES} -) +file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/*.c") +ggml_add_backend_library(ggml-qnn ${QNN_SOURCES}) -target_include_directories(ggml-qnn PRIVATE ${GGML_QNN_SDK_PATH}/include/QNN ${CMAKE_CURRENT_LIST_DIR}) +target_include_directories(ggml-qnn PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) -string(REGEX REPLACE "/$" "" GGML_QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") -target_compile_definitions(ggml-qnn PRIVATE GGML_QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") +string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") +target_compile_definitions(ggml-qnn PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 8e98a042df93b..23fe675aa97f3 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -6,24 +6,26 @@ * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools * * this single-source-file or self-contained implementation of ggml-qnn backend has 10 sections: - * section-1 forward/prototype declaration - * section-2 global vars, macros, data structures - * section-3 ggml-qnn internal troubleshooting function/class - * section-4 helper function for WoA(Windows on ARM) - * section-5 general helper function - * section-6 QNN helper function + * section-1 forward/prototype declaration, global vars, macros, data structures + * section-2 ggml-qnn internal troubleshooting function/class + * section-3 helper function for WoA(Windows on ARM) + * section-4 general helper function + * section-5 QNN helper function + * section-6 Hexagon DSP helper function * section-7 ggml-qnn backend helper function / class * section-8 implementation of ggml-qnn backend according to ggml's backend subsystem - * section-9 implementation of general approach or the first tech approach - * section-10 implementation of the second tech approach:mapping the entire ggml cgraph to a single QNN graph + * section-9 implementation of general approach through QNN and Hexagon DSP + * section-10 implementation of special approach through QNN:mapping the entire ggml cgraph to a single QNN graph * - * currently provide following ggml op' QNN backend implementation: - * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV: - * this is a simple skeleton, can expand other ggml ops according to expertise - * - GGML_OP_LOG/GGML_OP_SQRT: + * currently provide following ggml op' implementation through QNN: + * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV/GGML_OP_LOG/GGML_OP_SQRT: * 
this is a simple skeleton, can expand other ggml ops according to expertise * - GGML_OP_MUL_MAT: - * this is a complicated skeleton, can expand other complex ggml ops accordingly + * this is a complicated skeleton, can expand other ggml ops accordingly + * + * currently provide following ggml op' implementation through Hexagon DSP: + * - GGML_OP_ADD: + * this is a skeleton, can expand other ggml ops accordingly * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -51,15 +53,6 @@ #include #include #include -#if defined(__ANDROID__) || defined(__linux__) -#include -#include -#include -#include -#include -#include -#include -#endif #include #include @@ -83,8 +76,15 @@ #include #include #include -#if (defined __ANDROID__) || (defined ANDROID) -#include "android/log.h" + +#if defined(__ANDROID__) || defined(__linux__) +#include +#include +#include +#include +#include +#include +#include #endif #if !defined(__ANDROID__) && !defined(__linux__) @@ -93,6 +93,18 @@ #include #endif +#if defined(__ANDROID__) +#include "android/log.h" + +#include "rpcmem.h" +#include "remote.h" +#include "os_defines.h" +#include "domain.h" +#include "AEEStdErr.h" +#include "HAP_power.h" +#include "HAP_farf.h" +#endif + #include "QnnTypes.h" #include "QnnCommon.h" #include "QnnContext.h" @@ -110,16 +122,20 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" +#include "kernels/ggmlop.h" + // ================================================================================================= -// section-1: forward/prototype declaration, macro +// section-1: forward/prototype declaration, global vars, macros, data structures // ================================================================================================= class qnn_instance; struct qnn_parameter; struct ggml_backend_qnn_context; typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); +typedef int (* notif_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); +typedef int (* ggmlhexagon_op_func_t)(remote_handle64 handle, const dsptensor * src0, const dsptensor * src1, dsptensor * dst); -//general function prototypes for ggml-qnn backend +static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); static void ggmlqnn_dump_tensor(const ggml_tensor * tensor, const char * name); static enum ggml_status ggmlqnn_backend_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph); static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); @@ -132,7 +148,8 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Q void * data, uint32_t data_size, bool b_transpose = false); -//function prototypes for all op functions in the first tech approach(general approach in other backends) + +//function prototypes for all op functions in the general approach //general op function for elment-wise operation on 1/2 input tensors and 1 output tensor static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); @@ -163,7 +180,7 @@ static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_qnn_context * ctx, g static void ggmlqnn_compute_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void 
ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); -//function prototypes for all op functions in the second tech approach("mapping the entire cgraph to a single QNN graph") +//function prototypes for all op functions in the special approach("mapping the entire cgraph to a single QNN graph") static void ggmlqnn_graph_addnode(ggml_backend_qnn_context * ctx, struct ggml_cgraph * cgraph, Qnn_GraphHandle_t graph_handle, std::string & graph_name, ggml_tensor * op, bool is_reuse_graph = false); @@ -192,6 +209,7 @@ static void ggmlqnn_graph_addnode(ggml_backend_qnn_context * ctx, struct ggml_cg #define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 +#define STATUS_CONTEXT 0x12345678 #define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) #define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) @@ -239,13 +257,15 @@ static void ggmlqnn_graph_addnode(ggml_backend_qnn_context * ctx, struct ggml_cg #define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ - if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ - return; \ + if (g_qnn_params.inference_approach != DIRECT_USE_CDSP) { \ + if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ + return; \ + } \ } \ - } while (0) + } while (0) \ // ================================================================================================= -// section-2: data type, data structure, global vars +// section-1: data type, data structure, global vars // ================================================================================================= using pfn_rpc_mem_init = void (*)(void); using pfn_rpc_mem_deinit = void (*)(void); @@ -268,10 +288,27 @@ using qnn_cgraph_node_t = std::tuple; using qnn_multinode_res_t = std::tuple; -enum class qnn_profile_level { - profile_off = 0, - profile_basic = 1, - profile_detail = 2 +enum qnn_profile_level { + PROFILE_OFF = 0, + PROFILE_BASIC = 1, + PROFILE_DETAIL = 2 +}; + +//0: general approach through QNN +//1: general approach through Hexagon cDSP +//2: special approach through QNN:mapping entire ggml cgraph to QNN graph +enum inference_approach { + QNN_GENERAL = 0, + DIRECT_USE_CDSP = 1, + QNN_SINGLEGRAPH = 2, +}; + +enum hexagon_dsp_type { + HEXAGON_ADSP = 0, + HEXAGON_MDSP = 1, + HEXAGON_SDSP = 2, + HEXAGON_CDSP = 3, + HEXAGON_CDSP1 = 4, }; enum qcom_htp_arch { @@ -328,6 +365,10 @@ struct ggml_backend_qnn_context { size_t work_size; size_t desired_size; int n_threads; + + size_t rpc_mempool_len; + void * rpc_mempool; + remote_handle64 ggmlop_handle; }; struct qnn_op_caps { @@ -347,21 +388,14 @@ struct qnn_parameter { int hvx_threads; int vtcm_size_in_mb; int enable_dlbc; - int inference_approach; // 0: general approach,similar to ggml-sycl or ggml-cann 1: mapping entire ggml cgraph to QNN graph - int qnn_backend; // 0: QNN-CPU backend, 1: QNN-GPU backend, 2: QNN-NPU backend + int inference_approach; // 0: QNN_GENERAL 1: DIRECT_USE_CDSP 2: QNN_SINGELGRAPH + int qnn_backend; // 0: QNN-CPU backend 1: QNN-GPU backend 2: QNN-NPU backend const char * qnn_cfgfilename; const char * qnn_runtimelib_path; }; -//TODO:I don't think threadsafe is required at the moment -// so we can uniform them to avoid compiler/toolchain's complains -#if !defined(__ANDROID__) && !defined(__linux__) -static std::atomic g_qnntensor_idx(0); //ensure every QNN tensor name is unique -static std::atomic g_qnnopcfg_idx(0); //ensure every QNN opconfig name is unique -#else static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor 
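The inference_approach enum above selects between three execution paths. The dispatcher below is illustrative only: the function names mentioned in its comment are the ones introduced by this patch, but the wrapper itself and its printouts are made up.

    #include <cstdio>

    enum inference_approach {
        QNN_GENERAL     = 0,   // offload node by node through QNN, like ggml-sycl or ggml-cann
        DIRECT_USE_CDSP = 1,   // skip QNN and call the ggmlop kernels on the Hexagon cDSP
        QNN_SINGLEGRAPH = 2,   // map the whole ggml cgraph to a single QNN graph
    };

    // illustrative dispatcher: the real backend chooses between
    // ggmlqnn_backend_graph_compute_general() and ggmlqnn_backend_graph_compute_special()
    // (and, for DIRECT_USE_CDSP, the FastRPC ggmlop kernels) based on this setting
    static void compute_with_approach(int approach) {
        switch (approach) {
            case QNN_GENERAL:     printf("general approach through QNN\n");          break;
            case DIRECT_USE_CDSP: printf("general approach through Hexagon cDSP\n"); break;
            case QNN_SINGLEGRAPH: printf("single-graph approach through QNN\n");     break;
            default:              printf("unknown approach, fall back to QNN\n");    break;
        }
    }

    int main() {
        compute_with_approach(DIRECT_USE_CDSP);   // the value would come from g_qnn_params.inference_approach
        return 0;
    }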
name is unique static int32_t g_qnnopcfg_idx = 0; //ensure every QNN opconfig name is unique -#endif static struct qnn_parameter g_qnn_params = { .print_qnn_internal_log = 0, @@ -514,6 +548,14 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, }; +static domain hexagon_supported_domains[] = { + {ADSP_DOMAIN_ID, ADSP_DOMAIN}, + {MDSP_DOMAIN_ID, MDSP_DOMAIN}, + {SDSP_DOMAIN_ID, SDSP_DOMAIN}, + {CDSP_DOMAIN_ID, CDSP_DOMAIN}, + {CDSP1_DOMAIN_ID, CDSP1_DOMAIN} +}; + static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {true, GGML_OP_NONE, nullptr, 0, nullptr}, {false, GGML_OP_DUP}, @@ -624,7 +666,7 @@ static_assert(std::size(ggmlqnn_k_op_caps) == (GGML_OP_COUNT + GGML_UNARY_OP_COU "pls check ggmlqnn_k_op_caps and ensure is corresponding to latest ggml.h"); // ================================================================================================= -// section-3: ggml-qnn internal troubleshooting function/class +// section-2: ggml-qnn internal troubleshooting function/class // ================================================================================================= static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { static std::mutex ggmlqnn_log_internal_mutex; @@ -762,7 +804,7 @@ class qnn_perf { }; // ================================================================================================= -// section-4: helper function for WoA(Window on ARM) +// section-3: helper function for WoA(Window on ARM) // ================================================================================================= #if !defined(__ANDROID__) && !defined(__linux__) #define RTLD_GLOBAL 0x100 @@ -817,7 +859,7 @@ static const char * dlerror(void) { #endif // ================================================================================================= -// section-5: general helper function +// section-4: general helper function // ================================================================================================= //TODO: merge the following 6 helper functions which used to ensure every QNN tensor/opcfg name is unique static void ggmlqnn_reset_tensoridx() { @@ -829,11 +871,7 @@ static void ggmlqnn_inc_tensoridx() { } static int32_t ggmlqnn_get_tensoridx() { -#if !defined(__ANDROID__) && !defined(__linux__) - return g_qnntensor_idx.load(); -#else return g_qnntensor_idx; -#endif } static void ggmlqnn_reset_opcfgidx() { @@ -845,11 +883,7 @@ static void ggmlqnn_inc_opcfgidx() { } static int32_t ggmlqnn_get_opcfgidx() { -#if !defined(__ANDROID__) && !defined(__linux__) - return g_qnnopcfg_idx.load(); -#else return g_qnnopcfg_idx; -#endif } static void * ggmlqnn_mallocz_aligned(size_t size, size_t alignment) { @@ -994,7 +1028,7 @@ static void ggmlqnn_get_timestring(char * p_currenttime) { } // ================================================================================================= -// section-6: QNN helper function +// section-5: QNN helper function // ================================================================================================= static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { @@ -1342,6 +1376,810 @@ static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * p return opcfg; } +// ================================================================================================= +// section-6: Hexagon DSP helper function +// 
================================================================================================= +static const char * ggmlhexagon_get_dsp_name(int domain_id) { + switch (domain_id) { + case HEXAGON_ADSP: + return "Hexagon-aDSP"; + case HEXAGON_MDSP: + return "Hexagon-mDSP"; + case HEXAGON_SDSP: + return "Hexagon-sDSP"; + case HEXAGON_CDSP: + return "Hexagon-cDSP"; + case HEXAGON_CDSP1: + return "Hexagon-cDSP1"; + default: + return "Hexagon-unknown"; + } +} + +static int ggmlhexagon_pd_status_notifier_callback(void * context, int domain, int session, remote_rpc_status_flags_t status){ + int error = AEE_SUCCESS; + switch (status){ + case FASTRPC_USER_PD_UP: + GGMLQNN_LOG_DEBUG("PD is up\n"); + break; + case FASTRPC_USER_PD_EXIT: + GGMLQNN_LOG_DEBUG("PD closed\n"); + break; + case FASTRPC_USER_PD_FORCE_KILL: + GGMLQNN_LOG_DEBUG("PD force kill\n"); + break; + case FASTRPC_USER_PD_EXCEPTION: + GGMLQNN_LOG_DEBUG("PD exception\n"); + break; + case FASTRPC_DSP_SSR: + GGMLQNN_LOG_DEBUG("DSP SSR\n"); + break; + default : + error = AEE_EBADITEM; + break; + } + return error; +} + +static domain * ggmlhexagon_get_domain(int domain_id) { + int size = sizeof(hexagon_supported_domains) / sizeof(domain); + + for (size_t i = 0; i < size; i++) { + if (hexagon_supported_domains[i].id == domain_id) + return &hexagon_supported_domains[i]; + } + + return nullptr; +} + +static bool ggmlhexagon_is_cdsp(int domain_id) { + return (domain_id == HEXAGON_CDSP) || (domain_id == HEXAGON_CDSP1); +} + +static bool ggmlhexagon_is_valid_domain_id(int domain_id, int compute_only) { + int size = sizeof(hexagon_supported_domains) / sizeof(domain); + + if (compute_only) { + return ggmlhexagon_is_cdsp(domain_id); + } + + for (size_t i = 0; i < size; i++) { + if (hexagon_supported_domains[i].id == domain_id) + return true; + } + + return false; +} + +static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_domains, fastrpc_domain ** domains_info) { + int hexagon_err = AEE_SUCCESS; + int ss_info = 0; + ss_info = strcmp(domain_type, "NSP")? 
HPASS: NSP; + system_req_payload req; + memset(&req, 0, sizeof(system_req_payload)); + req.id = FASTRPC_GET_DOMAINS; + req.sys.domains = nullptr; + fastrpc_domain * domain = nullptr; + + if (ss_info != 0) { + req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info); + } else { + req.sys.flags = 0; + } + +#ifdef _WIN32 + hexagon_err = AEE_EUNSUPPORTED; + goto bail; +#endif + + if (remote_system_request) { + hexagon_err = remote_system_request(&req); + if (hexagon_err != AEE_SUCCESS) { + GGMLQNN_LOG_DEBUG("failure in remote_system_request call: %d", hexagon_err); + goto bail; + } + //allocate memory for domain-info array + req.sys.max_domains = req.sys.num_domains; + void * buffer = calloc(req.sys.num_domains, sizeof(fastrpc_domain)); + if (nullptr == buffer) { + hexagon_err = AEE_ENOMEMORY; + GGMLQNN_LOG_DEBUG("unable to allocate memory for req.sys.domains"); + goto bail; + } + req.sys.domains = static_cast(buffer); + hexagon_err = remote_system_request(&req); + if (hexagon_err != AEE_SUCCESS) { + GGMLQNN_LOG_DEBUG("failure in remote_system_request call: %d.\n", hexagon_err); + goto bail; + } + + for (int i = 0; i < req.sys.num_domains; i++) { + //verify that only requested type domains were returned + domain = &req.sys.domains[i]; + if (domain->type != ss_info) { + hexagon_err = -1; + GGMLQNN_LOG_DEBUG("incorrect data received from remote_system_request.\n"); + goto bail; + } + } + *domains_info = req.sys.domains; + *num_domains = req.sys.num_domains; + } else { + hexagon_err = AEE_EUNSUPPORTED; + goto bail; + } + +bail: + if (hexagon_err && req.sys.domains) { + free(req.sys.domains); + } + return hexagon_err; +} + +static int ggmlhexagon_get_dsp_support(int * domain) { + int hexagon_error = AEE_SUCCESS; + *domain = HEXAGON_CDSP; + + if (remote_handle_control) { + struct remote_dsp_capability dsp_capability_domain = {HEXAGON_CDSP, DOMAIN_SUPPORT, 0}; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + goto bail; + } + + if (0 == dsp_capability_domain.capability) { + dsp_capability_domain.domain = HEXAGON_ADSP; + dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT; + dsp_capability_domain.capability = 0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if (dsp_capability_domain.capability) { + *domain = HEXAGON_ADSP; + } + } + + if (hexagon_error != AEE_SUCCESS) { + GGMLQNN_LOG_DEBUG("get_dsp_support failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + + if (attr != VTCM_PAGE && attr != VTCM_COUNT) { + hexagon_error = AEE_EBADPARM; + GGMLQNN_LOG_DEBUG("unsupported attr, only VTCM_PAGE and VTCM_COUNT supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_ADSP || domain == HEXAGON_CDSP) { + /* + * query the DSP for VTCM information + * since the ADSP does not have a dedicated VTCM, we expect the output to be 0 + */ + struct remote_dsp_capability dsp_capability_vtcm_dsp; + dsp_capability_vtcm_dsp.domain = (uint32_t)domain; + 
dsp_capability_vtcm_dsp.attribute_ID = attr; + dsp_capability_vtcm_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + GGMLQNN_LOG_DEBUG("running the use case without checking the capability"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_vtcm_dsp.capability; + } else { + GGMLQNN_LOG_DEBUG("get_vtcm_info failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLQNN_LOG_DEBUG("unsupported domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static bool ggmlhexagon_is_unsignedpd_supported(int domain_id) { + int hexagon_error = AEE_SUCCESS; + if (remote_handle_control) { + struct remote_dsp_capability dsp_capability_domain = {static_cast(domain_id), UNSIGNED_PD_SUPPORT, 0}; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLQNN_LOG_WARN("FastRPC Capability API is not supported on this device. Falling back to signed pd"); + return false; + } + + if (hexagon_error) { + GGMLQNN_LOG_WARN("error 0x%x: FastRPC Capability API failed. falling back to signed pd", hexagon_error); + return false; + } + + if (dsp_capability_domain.capability == 1) { + return true; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLQNN_LOG_WARN("remote_dsp_capability interface is not supported on this device.falling back to signed pd"); + return false; + } + + return false; +} + +static bool ggmlhexagon_get_unsignedpd_support(void) { + return ggmlhexagon_is_unsignedpd_supported(HEXAGON_CDSP); +} + +static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { + int hexagon_error = AEE_SUCCESS; + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for ASYNC_FASTRPC_SUPPORT information + * Async fastrpc is supported only on CDSP + */ + struct remote_dsp_capability dsp_capability_async_support; + dsp_capability_async_support.domain = (uint32_t)domain; + dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT; + dsp_capability_async_support.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLQNN_LOG_WARN("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (dsp_capability_async_support.capability == 1) { + return true; + } + + if (hexagon_error != AEE_SUCCESS){ + GGMLQNN_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLQNN_LOG_WARN("async FastRPC is not supported on domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLQNN_LOG_WARN("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return false; +} + +static void ggmlhexagon_set_rpc_latency(int domain, int qos, int latency) { + int hexagon_error = AEE_SUCCESS; + + if 
(remote_handle_control) { + struct remote_rpc_control_latency data; +#if 1 + data.enable = RPC_PM_QOS; + data.latency = 300; +#else + data.enable = RPC_POLL_QOS; + data.latency = 1000; +#endif + data.enable = qos; + data.latency = latency; + hexagon_error = remote_handle64_control(DSPRPC_GET_DSP_INFO, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); + if (hexagon_error != AEE_SUCCESS){ + GGMLQNN_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } else { + GGMLQNN_LOG_INFO("set rpc qos %d, latency %d\n", qos, latency); + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLQNN_LOG_WARN("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return; +} + +static bool ggmlhexagon_is_status_notification_supported(int domain) { + int hexagon_error = AEE_SUCCESS; + + if (remote_handle_control) { + /* + * Query the DSP for STATUS_NOTIFICATION_SUPPORT information + * DSP User PD status notification Support + */ + struct remote_dsp_capability dsp_capability_status_notification_support; + dsp_capability_status_notification_support.domain = (uint32_t)domain; + dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; + dsp_capability_status_notification_support.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLQNN_LOG_WARN("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (1 == dsp_capability_status_notification_support.capability) { + return true; + } + + if (hexagon_error != AEE_SUCCESS){ + GGMLQNN_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLQNN_LOG_WARN("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return false; +} + +static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + + if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { + hexagon_error = AEE_EBADPARM; + GGMLQNN_LOG_WARN("unsupported attr, only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HMX SUPPORT information + * HMX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hmx_dsp; + dsp_capability_hmx_dsp.domain = (uint32_t)domain; + dsp_capability_hmx_dsp.attribute_ID = attr; + dsp_capability_hmx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } + else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hmx_dsp.capability; + } else { + GGMLQNN_LOG_DEBUG("get_hmx_support_info failed with Error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLQNN_LOG_DEBUG("HMX support is not there for domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + 
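+/*
+ * note: the capability helpers in this section all share one FastRPC query pattern:
+ * fill a remote_dsp_capability request (domain, attribute_ID, capability) and pass it
+ * to remote_handle_control(DSPRPC_GET_DSP_INFO, ...), then read back the capability
+ * field. a minimal sketch of that pattern, illustrative only and mirroring the calls
+ * used by the helpers above and below:
+ *
+ *   struct remote_dsp_capability cap = {HEXAGON_CDSP, ARCH_VER, 0};
+ *   if (AEE_SUCCESS == remote_handle_control(DSPRPC_GET_DSP_INFO, &cap, sizeof(cap))) {
+ *       //cap.capability now holds the queried value
+ *   }
+ */
+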
+static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if(remote_handle_control) { + /* + * Query the Hexagon processor architecture version information + */ + struct remote_dsp_capability dsp_capability_arch_ver; + dsp_capability_arch_ver.domain = (uint32_t)domain; + dsp_capability_arch_ver.attribute_ID = ARCH_VER; + dsp_capability_arch_ver.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_arch_ver, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_arch_ver.capability; + } else { + GGMLQNN_LOG_DEBUG("get_hex_arch_ver failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + + bail: + return hexagon_error; +} + +static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) +{ + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if (attr == HVX_SUPPORT_64B) { + hexagon_error = AEE_EBADPARM; + GGMLQNN_LOG_DEBUG("latest targets have 128 byte HVX register, use HVX_SUPPORT_128B instead of HVX_SUPPORT_64B"); + goto bail; + } + + if (attr != HVX_SUPPORT_128B) { + hexagon_error = AEE_EBADPARM; + GGMLQNN_LOG_DEBUG("unsupported attr. only HVX_SUPPORT_128B supported"); + goto bail; + } + + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HVX SUPPORT information + * HVX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hvx_dsp; + dsp_capability_hvx_dsp.domain = (uint32_t)domain; + dsp_capability_hvx_dsp.attribute_ID = attr; + dsp_capability_hvx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF)==(AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hvx_dsp.capability; + } else { + GGMLQNN_LOG_DEBUG("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLQNN_LOG_DEBUG("HVX support is not available on domain %d", domain); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } + +bail: + return hexagon_error; +} + +static int ggmlhexagon_request_status_notifications(int domain_id, void * context, notif_callback_fn call_back_fn) { + int hexagon_error = AEE_SUCCESS; + struct remote_rpc_notif_register notif; + bool status_notification_support; + + notif.context = context; + notif.domain = domain_id; + notif.notifier_fn = call_back_fn; + + status_notification_support = ggmlhexagon_is_status_notification_supported(domain_id); + if (status_notification_support) { + hexagon_error = remote_session_control(FASTRPC_REGISTER_STATUS_NOTIFICATIONS, (void*)¬if, sizeof(notif)); + if (hexagon_error != AEE_SUCCESS) { + GGMLQNN_LOG_DEBUG("error 0x%x: remote_session_control failed to enable status notifications", hexagon_error); + } + } else { + 
hexagon_error = AEE_EUNSUPPORTEDAPI; + } + + return hexagon_error; +} + +//TODO: doesn't work on cDSP currently +static AEEResult ggmlhexagon_set_clocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled) { +#if 0 + GGMLQNN_LOG_DEBUG("----------- entering power set clocks"); + + HAP_power_request_t request; + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_apptype; + request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; + + void * benchmark_ctx = (void*)(handle); + int retval = HAP_power_set(benchmark_ctx, &request); + if (retval) { + GGMLQNN_LOG_WARN("failed first power vote"); + return AEE_EFAILED; + } + + //configure clocks & DCVS mode + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_DCVS_v2; + request.dcvs_v2.dcvs_enable = TRUE; + request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; + if (dcvs_enabled) { + request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; + request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; + } else { + request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner; + request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner; + } + request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + request.dcvs_v2.set_dcvs_params = TRUE; + request.dcvs_v2.set_latency = TRUE; + request.dcvs_v2.latency = latency; + retval = HAP_power_set(benchmark_ctx, &request); + if (retval) { + GGMLQNN_LOG_WARN("failed to vote for performance mode"); + return AEE_EFAILED; + } + + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_HVX; + request.hvx.power_up = TRUE; + retval = HAP_power_set(benchmark_ctx, &request); + if (retval) { + GGMLQNN_LOG_WARN("failed to vote for HVX power"); + return AEE_EFAILED; + } +#endif + return AEE_SUCCESS; +} + +static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { + int hexagon_error = AEE_SUCCESS; + + int domain_id = HEXAGON_CDSP; + const char * domain_type = "NSP"; + + int unsignedpd_flag = 1; + bool is_unsignedpd_enabled = false; + int use_logical_id = 0; + int core_id = -1; + fastrpc_domain * domains_info = NULL; + fastrpc_domain * domain_info = NULL; + int num_domains = -1; + + domain * my_domain = NULL; + char * uri = NULL; + + char * ggmlop_domain_uri = NULL; + int ggmlop_domain_uri_len = 0; + + if (nullptr == ctx) + return 1; + GGMLQNN_LOG_INFO("init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device)); + //TODO: reasonable rpc memory pool size and use it practically + ctx->ggmlop_handle = -1; + ctx->rpc_mempool_len = (1 << 20) * 512; + ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, ctx->rpc_mempool_len); + if (nullptr == ctx->rpc_mempool) { + hexagon_error = AEE_ENORPCMEMORY; + GGMLQNN_LOG_WARN("rpc memory alloc failed with error 0x%x", hexagon_error); + ctx->rpc_mempool_len = 0; + return 2; + } + + if (domain_id == -1) { + if (domain_type != NULL) { + if ((strcmp(domain_type, "NSP") != 0 && strcmp(domain_type, "HPASS") != 0)) { + GGMLQNN_LOG_WARN("invalid domain_type %s. possible values are NSP or HPASS", domain_type); + goto bail; + } else { + hexagon_error = ggmlhexagon_get_domains_info(domain_type, &num_domains, &domains_info); + if (hexagon_error == AEE_EUNSUPPORTED) { + GGMLQNN_LOG_DEBUG("API is not supported on this target so cannot get domains info from the device. 
falling back to legacy approach of using default domain id"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLQNN_LOG_DEBUG("error: 0x%x, defaulting to CDSP domain", hexagon_error); + } + } else if (hexagon_error != AEE_SUCCESS) { + GGMLQNN_LOG_DEBUG("error in getting domains information"); + goto bail; + } else { + if (core_id != -1) { + if (core_id < 0 || core_id >= num_domains) { + GGMLQNN_LOG_DEBUG("invalid core_id = %d for %s. core_id should be between 0 to %d", core_id, domain_type, num_domains - 1); + hexagon_error = AEE_EBADPARM; + goto bail; + } + } else { + core_id = 0; + } + use_logical_id = 1; + domain_id = domains_info[core_id].id; + } + } + } else { + GGMLQNN_LOG_DEBUG("DSP domain is not provided, retrieving DSP information using Remote APIs"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLQNN_LOG_DEBUG("error: 0x%x, defaulting to CDSP domain", hexagon_error); + } + } + } + + if (0 == use_logical_id) { + if (!ggmlhexagon_is_valid_domain_id(domain_id, 0)) { + hexagon_error = AEE_EBADPARM; + GGMLQNN_LOG_DEBUG("error 0x%x: invalid domain %d", hexagon_error, domain_id); + goto bail; + } + + my_domain = ggmlhexagon_get_domain(domain_id); + if (nullptr == my_domain) { + GGMLQNN_LOG_DEBUG("unable to get domain struct %d", domain_id); + goto bail; + } + uri = my_domain->uri; + } else { + domain_info = &domains_info[domain_id]; + uri = (char *)malloc(MAX_DOMAIN_NAMELEN); + if (nullptr == uri) { + hexagon_error = AEE_ENOMEMORY; + GGMLQNN_LOG_DEBUG("unable to allocated memory for uri of size: %d", MAX_DOMAIN_NAMELEN); + goto bail; + } + snprintf(uri, MAX_DOMAIN_NAMELEN, "%s%s", "&_dom=", domain_info->name); + } + GGMLQNN_LOG_INFO("\ndomain uri=%s\n", uri); + + if (1 == unsignedpd_flag) { + is_unsignedpd_enabled = ggmlhexagon_is_unsignedpd_supported(domain_id); + if (!is_unsignedpd_enabled) { + GGMLQNN_LOG_DEBUG("overriding user request for unsigned PD, only signed offload is allowed on domain %d", domain_id); + unsignedpd_flag = 0; + } + } + + GGMLQNN_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLQNN_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); + if (is_unsignedpd_enabled) { + if (remote_session_control) { + struct remote_rpc_control_unsigned_module data; + data.enable = 1; + data.domain = domain_id; + hexagon_error = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); + GGMLQNN_LOG_DEBUG("remote_session_control returned %d for configuring unsigned PD success", hexagon_error); + if (AEE_SUCCESS != hexagon_error) { + GGMLQNN_LOG_DEBUG("error 0x%x: remote_session_control failed", hexagon_error); + } + } else { + GGMLQNN_LOG_DEBUG("unsigned PD not supported on this device"); + hexagon_error = AEE_EUNSUPPORTED; + GGMLQNN_LOG_DEBUG("error 0x%x: remote_session_control interface is not supported on this device", hexagon_error); + } + } + + hexagon_error = ggmlhexagon_request_status_notifications(domain_id, (void *)STATUS_CONTEXT, ggmlhexagon_pd_status_notifier_callback); + if (AEE_SUCCESS != hexagon_error) { + if (AEE_EUNSUPPORTEDAPI != hexagon_error) { + GGMLQNN_LOG_WARN("error 0x%x: hexagon_request_status_notifications failed", hexagon_error); + } + GGMLQNN_LOG_WARN("error 0x%x: failed to compute on domain %d", hexagon_error, domain_id); + goto bail; + } + + ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; + ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); + 
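+    //bail out if the allocation above failed, mirroring the uri allocation check earlier in this function
+    if (nullptr == ggmlop_domain_uri) {
+        hexagon_error = AEE_ENOMEMORY;
+        GGMLQNN_LOG_DEBUG("unable to allocate memory for ggmlop domain uri");
+        goto bail;
+    }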
snprintf(ggmlop_domain_uri, ggmlop_domain_uri_len, "%s%s", ggmlop_URI, uri); + GGMLQNN_LOG_INFO("ggmlop domain uri:%s\n", ggmlop_domain_uri); + hexagon_error = ggmlop_open(ggmlop_domain_uri, &ctx->ggmlop_handle); + if (AEE_SUCCESS == hexagon_error) { + GGMLQNN_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLQNN_LOG_INFO("only support GGML_OP_ADD on cDSP currently\n"); + ggmlhexagon_set_clocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1); + ggmlhexagon_set_rpc_latency(domain_id, RPC_POLL_QOS, 1000); + } else { + GGMLQNN_LOG_WARN("error 0x%x: failed to compute on domain %d(%s)", hexagon_error, domain_id, + ggmlhexagon_get_dsp_name(domain_id)); + goto bail; + } + + return 0; +bail: + if (ggmlop_domain_uri) { + free(ggmlop_domain_uri); + } + + if (uri) { + free(uri); + } + + if (ctx->rpc_mempool) { + rpcmem_free(ctx->rpc_mempool); + ctx->rpc_mempool = nullptr; + ctx->rpc_mempool_len = 0; + ctx->ggmlop_handle = -1; + } + + return -1; +} + +static void ggmlhexagon_close_cdsp(ggml_backend_qnn_context * ctx) { + int hexagon_error = AEE_SUCCESS; + GGMLQNN_LOG_DEBUG("enter %s", __func__); + if (-1 != ctx->ggmlop_handle) { + hexagon_error = ggmlop_close(ctx->ggmlop_handle); + if (AEE_SUCCESS != hexagon_error) { + GGMLQNN_LOG_WARN("error 0x%x: failed to close ggmlop handle", hexagon_error); + } else { + ctx->ggmlop_handle = -1; + } + } + + if (ctx->rpc_mempool) { + rpcmem_free(ctx->rpc_mempool); + ctx->rpc_mempool = nullptr; + ctx->rpc_mempool_len = 0; + } + GGMLQNN_LOG_DEBUG("leave %s", __func__); +} + +static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tensor * op) { + //skip sanity check because already checked in other place + struct dsptensor dsptensor_0; + struct dsptensor dsptensor_1; + struct dsptensor dsptensor_2; + + int hexagon_error = AEE_SUCCESS; + ggmlhexagon_op_func_t op_func = nullptr; + void * wdata = nullptr; + + ggml_tensor * src0 = op->src[0]; + //TODO: src1 might-be nullptr + ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + ggml_type src0_type = src0->type; + + switch (op->op) { + case GGML_OP_ADD: + op_func = ggmlop_add; + break; + case GGML_OP_MUL_MAT: { + wdata = ggmlqnn_type_trait(ctx, op); + op_func = ggmlop_mulmat; + break; + } + default: + return; + } + + if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) { + dsptensor_0.data = static_cast(wdata); + dsptensor_0.dataLen = ctx->desired_size; + } else { + dsptensor_0.data = static_cast(src0->data); + dsptensor_0.dataLen = ggml_nbytes(src0); + } + dsptensor_1.data = static_cast(src1->data); + dsptensor_2.data = static_cast(dst->data); + dsptensor_0.type = GGML_TYPE_F32; + dsptensor_1.type = GGML_TYPE_F32; + dsptensor_2.type = GGML_TYPE_F32; + dsptensor_0.ne[0] = src0->ne[0]; + dsptensor_0.ne[1] = src0->ne[1]; + dsptensor_0.ne[2] = src0->ne[2]; + dsptensor_0.ne[3] = src0->ne[3]; + dsptensor_0.nb[0] = src0->nb[0]; + dsptensor_0.nb[1] = src0->nb[1]; + dsptensor_0.nb[2] = src0->nb[2]; + dsptensor_0.nb[3] = src0->nb[3]; + dsptensor_1.dataLen = ggml_nbytes(src1); + dsptensor_2.dataLen = ggml_nbytes(dst); + hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); + if (AEE_SUCCESS != hexagon_error) { + GGMLQNN_LOG_WARN("ggmlop computation fail on cdsp"); + } +} + // ================================================================================================= // section-7:ggml-qnn backend helper function / class // 
================================================================================================= @@ -1383,6 +2221,19 @@ static const char * ggmlqnn_get_htparch_desc(size_t htp_arch) { } } +static const char * ggmlqnn_get_inference_approach_name(int inference_approach) { + switch (inference_approach) { + case 0: + return "QNN_GENERAL"; + case 1: + return "DIRECT_USE_CDSP"; + case 2: + return "QNN_SINGLEGRAPH"; + default: + return "unknown approach"; + } +} + static struct qcom_socinfo * ggmlqnn_get_socinfo_from_socmodel(uint32_t soc_model) { size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); for (size_t idx = 0; idx < items; idx++) { @@ -2060,7 +2911,7 @@ class qnn_instance { bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; - qnn_profile_level _profile_level = qnn_profile_level::profile_off; + qnn_profile_level _profile_level = PROFILE_OFF; void * _system_lib_handle = nullptr; void * _loaded_lib_handle = nullptr; @@ -2710,9 +3561,9 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { GGMLQNN_LOG_INFO("create device successfully\n"); } - if (qnn_profile_level::profile_off != _profile_level) { + if (PROFILE_OFF != _profile_level) { GGMLQNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (qnn_profile_level::profile_basic == _profile_level) { + if (PROFILE_BASIC == _profile_level) { GGMLQNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { @@ -2721,7 +3572,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } else { GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); } - } else if (qnn_profile_level::profile_detail == _profile_level) { + } else if (PROFILE_DETAIL == _profile_level) { GGMLQNN_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { @@ -3279,6 +4130,38 @@ void qnn_instance::htp_enter_performance_mode() { } } +static void ggmlqnn_set_runtime_path(size_t device, const std::string & path) { + if ((QNN_BACKEND_NPU == device) || (DIRECT_USE_CDSP == g_qnn_params.inference_approach)) { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); + } else { + GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); + } else { + GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLQNN_LOG_INFO("%s backend setenv successfully\n", + ggml_backend_qnn_get_devname(device)); + } else { + GGMLQNN_LOG_ERROR("%s backend setenv failure\n", + ggml_backend_qnn_get_devname(device)); + } + } +} + static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { GGMLQNN_LOG_WARN("invalid params\n"); @@ -3302,7 +4185,7 @@ static void ggmlqnn_load_cfg() { //this function can be called in various scenarios static bool initialized = false; if (initialized) { - GGMLQNN_LOG_DEBUG("qnn cfg file already loadded\n"); + GGMLQNN_LOG_INFO("qnn cfg file already loadded\n"); return; } char time_string[GGML_QNN_TMPBUF_LEN]; @@ -3330,7 +4213,8 @@ static void ggmlqnn_load_cfg() { qnncfg_instance.get_intvalue("npu", "enable_dlbc", g_qnn_params.enable_dlbc, 0); qnncfg_instance.get_stringvalue("npu", "precision_mode", precision_mode, "fp32"); GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_qnn_params.print_qnn_internal_log); - GGMLQNN_LOG_INFO("inference_approach=%d", g_qnn_params.inference_approach); + GGMLQNN_LOG_INFO("inference_approach=%d(%s)", g_qnn_params.inference_approach, + ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach)); GGMLQNN_LOG_INFO("qnn_backend=%d", g_qnn_params.qnn_backend); GGMLQNN_LOG_INFO("npu inference precision mode=%s", precision_mode.c_str()); GGMLQNN_LOG_INFO("qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); @@ -3488,6 +4372,12 @@ static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const st return true; } + if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + //FIXME: mulmat on cDSP doesn't work as expected + if (op_tensor->op != GGML_OP_ADD) + return false; + } + if (!ggmlqnn_k_op_caps[ggmlqnn_get_op_index(op_tensor)].supported) { return false; } @@ -3854,9 +4744,13 @@ static const char * ggml_backend_qnn_name(ggml_backend_t backend) { static void ggml_backend_qnn_free(ggml_backend_t backend) { GGMLQNN_LOG_DEBUG("enter %s", __func__ ); - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; + ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; GGMLQNN_LOG_DEBUG("device idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); + if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + 
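+        //close the ggmlop handle on the cDSP and release the rpc memory pool acquired in ggmlhexagon_init_dsp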
ggmlhexagon_close_cdsp(ctx); + } + qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { std::map::iterator singlenode_graph_it; @@ -3899,20 +4793,11 @@ static void ggml_backend_qnn_free(ggml_backend_t backend) { GGMLQNN_LOG_DEBUG("leave %s", __func__ ); } -//this is the first tech approach(or general approach in other ggml backends, such as ggml-sycl or ggml-cann) static enum ggml_status ggmlqnn_backend_graph_compute_general(ggml_backend_t backend, struct ggml_cgraph * cgraph) { enum ggml_status result = GGML_STATUS_SUCCESS; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; GGML_UNUSED(ctx); -#if 0 - GGMLQNN_LOG_DEBUG("device %d", ctx->device); - GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); - int num_nodes = std::min(5, cgraph->n_nodes); - for (int i = 0; i < num_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); - } -#endif + for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE @@ -3989,7 +4874,7 @@ static enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_ if (QNN_BACKEND_CPU == ctx->device) return GGML_BACKEND_DEVICE_TYPE_ACCEL; else if (QNN_BACKEND_GPU == ctx->device) - return GGML_BACKEND_DEVICE_TYPE_GPU; + return GGML_BACKEND_DEVICE_TYPE_ACCEL; else if (QNN_BACKEND_NPU == ctx->device) return GGML_BACKEND_DEVICE_TYPE_ACCEL; else @@ -4197,8 +5082,10 @@ ggml_backend_reg_t ggml_backend_qnn_reg() { //case-2: normal scenario, such as llama-cli or UI applicaton ggmlqnn_load_cfg(); + GGMLQNN_LOG_INFO("inference approach=%d(%s)", g_qnn_params.inference_approach, + ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach)); GGMLQNN_LOG_INFO("user's specified qnn_backend=%d", g_qnn_params.qnn_backend); - GGMLQNN_LOG_INFO("user's sepcified qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); + GGMLQNN_LOG_INFO("user's specified qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); if (g_qnn_params.qnn_backend >= GGML_QNN_MAX_DEVICES) { GGMLQNN_LOG_INFO("assume default ggml backend\n"); GGMLQNN_LOG_DEBUG("leave ggml_backend_qnn_reg"); @@ -4234,6 +5121,58 @@ ggml_backend_reg_t ggml_backend_qnn_reg() { return ® } +const char * ggml_backend_qnn_get_devname(size_t dev_num) { + if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + if (dev_num == QNN_BACKEND_GGML) + return "ggml"; + else + return "ggml-hexagon"; + } + + switch (dev_num) { + case QNN_BACKEND_CPU: + return "QNN-CPU"; + case QNN_BACKEND_GPU: + return "QNN-GPU"; + case QNN_BACKEND_NPU: + return "QNN-NPU"; + case QNN_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; + } +} + +static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) { + int result = 0; + GGMLQNN_LOG_INFO("inference approach=%d(%s)", g_qnn_params.inference_approach, + ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach)); + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + GGMLQNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", + ggml_backend_qnn_get_devname(device)); + delete instance; + return nullptr; + } + qnn_interface qnn_interface = 
instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + GGMLQNN_LOG_WARN("qnn subsystem failure\n"); + delete instance; + return nullptr; + } + + std::string device_name = ggml_backend_qnn_get_devname(device); + GGMLQNN_LOG_INFO("qnn device name %s", device_name.c_str()); + g_qnn_mgr[device].instance = instance; + g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + + return instance; +} + /** * * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU @@ -4257,69 +5196,27 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return nullptr; } - if (nullptr != g_qnn_mgr[device].backend) { - GGMLQNN_LOG_INFO("qnn backend %d(%s) already loaded", device, ggml_backend_qnn_get_devname(device)); - GGMLQNN_LOG_INFO("leave %s\n", __func__); - return g_qnn_mgr[device].backend; - } - #if defined(__ANDROID__) std::string path = qnn_lib_path; GGMLQNN_LOG_INFO("lib_path %s", path.c_str()); - if (QNN_BACKEND_NPU == device) { - if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), - 1)) { - GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); - } else { - GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); - } - if (0 == setenv("ADSP_LIBRARY_PATH", - (path + - ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), - 1)) { - GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); - } else { - GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); - } - } else { - if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), - 1)) { - GGMLQNN_LOG_INFO("%s backend setenv successfully\n", ggml_backend_qnn_get_devname(device)); - } else { - GGMLQNN_LOG_ERROR("%s backend setenv failure\n", ggml_backend_qnn_get_devname(device)); - } - } + ggmlqnn_set_runtime_path(device, path); #endif - qnn_instance * instance = nullptr; - instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); - result = instance->qnn_init(nullptr); - if (0 != result) { - GGMLQNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", ggml_backend_qnn_get_devname(device)); - delete instance; - return nullptr; - } - qnn_interface qnn_interface = instance->get_qnn_interface(); - if (!qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("qnn subsystem failure\n"); - delete instance; - return nullptr; + if (nullptr != g_qnn_mgr[device].backend) { + GGMLQNN_LOG_INFO("backend %d(%s) already loaded", device, + ggml_backend_qnn_get_devname(device)); + GGMLQNN_LOG_INFO("leave %s\n", __func__); + return g_qnn_mgr[device].backend; } - std::string device_name = ggml_backend_qnn_get_devname(device); - GGMLQNN_LOG_INFO("qnn device name %s", device_name.c_str()); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); - g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path); + if (nullptr == instance) + return nullptr; - if (0 == g_qnn_params.inference_approach) { - ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_general; - } else { + if (QNN_SINGLEGRAPH == g_qnn_params.inference_approach) { ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_special; + } else { + 
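+        //QNN_GENERAL and DIRECT_USE_CDSP both take the per-op compute path; the cDSP dispatch happens inside each op function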
ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_general; } ggml_backend_t qnn_backend = new ggml_backend{ @@ -4329,7 +5226,16 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { /* .context = */ &g_qnn_mgr[device] }; - g_qnn_mgr[device].backend = qnn_backend; + g_qnn_mgr[device].backend = qnn_backend; + if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + int result = ggmlhexagon_init_dsp(&g_qnn_mgr[device]); + if (0 != result) { + GGMLQNN_LOG_INFO("init hexagon dsp failure"); + ggml_backend_qnn_free(qnn_backend); + return nullptr; + } + } + GGMLQNN_LOG_INFO("leave %s\n", __func__); return qnn_backend; @@ -4338,7 +5244,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { GGML_BACKEND_DL_IMPL(ggml_backend_qnn_reg) // ================================================================================================= -// section-9: general approach: offload GGML op to QNN backend +// section-9: general approach: offload GGML op to QNN backend or offload GGML op to Hexagon DSP directly // ================================================================================================= static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { /* @@ -4370,7 +5276,7 @@ static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const } /* - * provide a general skeleton to offload ggml op to QNN backend: peform element-wise operation on 1/2 + * provide a general skeleton to offload ggml op to QNN backend or Hexagon cDSP: peform element-wise operation on 1/2 * input tensors and 1 output tensors */ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * op) { @@ -4393,20 +5299,25 @@ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_ten std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); const char * ggml_op_name = ggml_op_name_string.c_str(); - bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; - std::string graph_name; ggmlqnn_get_graphkey_from_op(op, graph_name); qnn_perf op_perf = qnn_perf(graph_name); op_perf.start(); + if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + ggmlhexagon_compute(ctx, op); + op_perf.info(); + return; + } + + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { //retrieve computational resource from cached QNN graph - qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_ptensors_t & ptensors = std::get<1>(graph_item); - p_tensor0 = ptensors[0]; + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & ptensors = std::get<1>(graph_item); + p_tensor0 = ptensors[0]; if (2 == input_param_count) { p_tensor1 = ptensors[1]; p_tensor2 = ptensors[2]; @@ -4415,10 +5326,12 @@ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_ten p_tensor2 = ptensors[1]; } } else { - GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); GGML_ASSERT(instance->get_device_id() == ctx->device); + GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); //create QNN graph - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), g_qnn_params.vtcm_size_in_mb, g_qnn_params.hvx_threads); + error = instance->init_qnn_graph(graph_name, 
static_cast(ctx->device), + g_qnn_params.vtcm_size_in_mb, + g_qnn_params.hvx_threads); if (QNN_SUCCESS != error) { GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); return; @@ -4431,7 +5344,7 @@ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_ten if (2 == input_param_count) { p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); } - p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); + p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); //compose QNN graph qnn_tensors_t input_tensors; @@ -4443,25 +5356,12 @@ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_ten Qnn_Tensor_t output_tensors[] = { *p_tensor2 }; -#if 0 // keep them for understand code easily - Qnn_OpConfig_t op_config = { - QNN_OPCONFIG_VERSION_1, { - ggml_op_name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, - 0, - nullptr, - input_param_count, - tensor_inputs, - 1, - tensor_outputs - } - }; -#else - Qnn_OpConfig_t op_config = ggmlqnn_create_op_config(ggml_op_name, QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, nullptr, 0, - input_tensors.data(), input_param_count, output_tensors, 1); -#endif + Qnn_OpConfig_t op_config = ggmlqnn_create_op_config(ggml_op_name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, nullptr, 0, + input_tensors.data(), + input_param_count, output_tensors, + 1); CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); //finalize QNN graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); @@ -4475,20 +5375,21 @@ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_ten qnn_elementwise_tensors.push_back(p_tensor1); } qnn_elementwise_tensors.push_back(p_tensor2); - auto graph_item = std::make_tuple(graph_handle, qnn_elementwise_tensors); + auto graph_item = std::make_tuple(graph_handle, qnn_elementwise_tensors); ctx->qnn_singlenode_graph_map[graph_name] = graph_item; } if (enable_npu_rpc) { - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor0)->memHandle)); + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*p_tensor0)->memHandle)); GGMLQNN_LOG_DEBUG("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); if (nullptr != qnn_buffer_0) { memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); } if (2 == input_param_count) { - uint8_t *qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*p_tensor1)->memHandle)); + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*p_tensor1)->memHandle)); GGMLQNN_LOG_DEBUG("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); if (nullptr != qnn_buffer_1) { memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); @@ -4784,7 +5685,6 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - const enum ggml_type src0_type = src0->type; const uint32_t src0_rank = ggml_n_dims(src0); const uint32_t src1_rank = ggml_n_dims(src1); @@ -4802,39 +5702,51 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor qnn_perf op_perf = qnn_perf(graph_name); op_perf.start(); + if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + ggmlhexagon_compute(ctx, op); + op_perf.info(); + return; + } + void * wdata = 
ggmlqnn_type_trait(ctx, op); const size_t desired_size = ctx->desired_size; if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { //retrieve computational resource from cached QNN graph - qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_ptensors_t & tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_tensor1 = tensors[1]; - p_tensor2 = tensors[2]; - p_param_tensor = tensors[3]; - p_tensor2_transpose = tensors[4]; + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t &tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_tensor1 = tensors[1]; + p_tensor2 = tensors[2]; + p_param_tensor = tensors[3]; + p_tensor2_transpose = tensors[4]; } else { //create QNN graph GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), g_qnn_params.vtcm_size_in_mb, g_qnn_params.hvx_threads); + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), + g_qnn_params.vtcm_size_in_mb, + g_qnn_params.hvx_threads); if (QNN_SUCCESS != error) { - GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", + graph_name.c_str(), error); return; } graph_handle = instance->get_qnn_graph_handle(); //create computational tensor p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, nullptr, - QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, - nullptr, nullptr, 0); + QNN_TENSOR_TYPE_APP_WRITE, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, nullptr, - QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, src0_rank, - nullptr, nullptr, 0); + QNN_TENSOR_TYPE_APP_WRITE, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); p_tensor2 = ggmlqnn_create_general_tensor(instance, graph_handle, dst, nullptr, - QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, src0_rank, - nullptr, nullptr, 0); + QNN_TENSOR_TYPE_APP_READ, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); //create param tensor for offload 2d/3d/4d matrix multiplication const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { @@ -4845,29 +5757,43 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor }; uint32_t param_tensor_dims[1] = {src0_rank}; p_param_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "param", - QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, param_tensor_dims, - (void *)(param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t)); + QNN_TENSOR_TYPE_STATIC, + QNN_DATATYPE_UINT_32, 1, + param_tensor_dims, + (void *) (param_tensor_data[src0_rank - 1]), + src0_rank * sizeof(uint32_t)); //create transpose tensor - p_tensor2_transpose = ggmlqnn_create_general_tensor(instance, graph_handle, dst, "transpose", - QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, - nullptr, nullptr, 0, true); + p_tensor2_transpose = ggmlqnn_create_general_tensor(instance, graph_handle, dst, + "transpose", + QNN_TENSOR_TYPE_NATIVE, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0, true); //compose QNN graph: add mulmat node - Qnn_Param_t out_0_params[] = {{QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = 
{QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; + Qnn_Param_t out_0_params[] = { + {QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = { + QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; - Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, out_0_params, 1, out_0_inputs, 2, out_0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_0)); + Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, out_0_params, 1, + out_0_inputs, 2, out_0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_0)); //compose QNN graph: add transpose node - Qnn_Param_t out_trans1_0_params[] = { {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; - Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Param_t out_trans1_0_params[] = { + {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; - Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, out_trans1_0_params, 1, out_trans1_0_inputs, 1, out_trans1_0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle,out_trans1_0)); + Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, + out_trans1_0_params, 1, + out_trans1_0_inputs, 1, + out_trans1_0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_trans1_0)); //finalize QNN graph CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); @@ -4880,7 +5806,7 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor ggml_op_mulmat_tensors.push_back(p_tensor2); ggml_op_mulmat_tensors.push_back(p_param_tensor); ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); - auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); ctx->qnn_singlenode_graph_map[graph_name] = graph_item; } @@ -5032,7 +5958,7 @@ static void ggmlqnn_compute_rope(ggml_backend_qnn_context * ctx, ggml_tensor * d } // ================================================================================================= -// section-10: second approach: mapping ggml computational cgraph to QNN graph +// section-10: special approach: mapping ggml computational cgraph to QNN graph // ================================================================================================= // TODO: remove duplicated codes between section-9 and section-10 // TODO: the graph algorithm in this section is naive, should optimized by AI experts diff --git a/ggml/src/ggml-qnn/kernels/ggmlop.h b/ggml/src/ggml-qnn/kernels/ggmlop.h new file mode 100644 index 0000000000000..b45070c20001b --- /dev/null +++ b/ggml/src/ggml-qnn/kernels/ggmlop.h @@ -0,0 +1,289 @@ +#ifndef _GGMLOP_H +#define _GGMLOP_H +/// @file ggmlop.idl +/// +//qidl copyright +//qidl nested=false +#include +#include +#include +#include + +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define 
__QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + +#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE +#ifndef _QAIC_ENV_H +#define _QAIC_ENV_H + +#include +#ifdef _WIN32 +#include "qtest_stdlib.h" +#else +#define MALLOC malloc +#define FREE free +#endif + +#ifdef __GNUC__ +#ifdef __clang__ +#pragma GCC diagnostic ignored "-Wunknown-pragmas" +#else +#pragma GCC diagnostic ignored "-Wpragmas" +#endif +#pragma GCC diagnostic ignored "-Wuninitialized" +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#ifndef _ATTRIBUTE_UNUSED + +#ifdef _WIN32 +#define _ATTRIBUTE_UNUSED +#else +#define _ATTRIBUTE_UNUSED __attribute__ ((unused)) +#endif + +#endif // _ATTRIBUTE_UNUSED + +#ifndef _ATTRIBUTE_VISIBILITY + +#ifdef _WIN32 +#define _ATTRIBUTE_VISIBILITY +#else +#define _ATTRIBUTE_VISIBILITY __attribute__ ((visibility("default"))) +#endif + +#endif // _ATTRIBUTE_VISIBILITY + +#ifndef __QAIC_REMOTE +#define __QAIC_REMOTE(ff) ff +#endif //__QAIC_REMOTE + +#ifndef __QAIC_HEADER +#define __QAIC_HEADER(ff) ff +#endif //__QAIC_HEADER + +#ifndef __QAIC_HEADER_EXPORT +#define __QAIC_HEADER_EXPORT +#endif // __QAIC_HEADER_EXPORT + +#ifndef __QAIC_HEADER_ATTRIBUTE +#define __QAIC_HEADER_ATTRIBUTE +#endif // __QAIC_HEADER_ATTRIBUTE + +#ifndef __QAIC_IMPL +#define __QAIC_IMPL(ff) ff +#endif //__QAIC_IMPL + +#ifndef __QAIC_IMPL_EXPORT +#define __QAIC_IMPL_EXPORT +#endif // __QAIC_IMPL_EXPORT + +#ifndef __QAIC_IMPL_ATTRIBUTE +#define __QAIC_IMPL_ATTRIBUTE +#endif // __QAIC_IMPL_ATTRIBUTE + +#ifndef __QAIC_STUB +#define __QAIC_STUB(ff) ff +#endif //__QAIC_STUB + +#ifndef __QAIC_STUB_EXPORT +#define __QAIC_STUB_EXPORT +#endif // __QAIC_STUB_EXPORT + +#ifndef __QAIC_STUB_ATTRIBUTE +#define __QAIC_STUB_ATTRIBUTE +#endif // __QAIC_STUB_ATTRIBUTE + +#ifndef __QAIC_SKEL +#define __QAIC_SKEL(ff) ff +#endif //__QAIC_SKEL__ + +#ifndef __QAIC_SKEL_EXPORT +#define __QAIC_SKEL_EXPORT +#endif // __QAIC_SKEL_EXPORT + +#ifndef __QAIC_SKEL_ATTRIBUTE +#define __QAIC_SKEL_ATTRIBUTE +#endif // __QAIC_SKEL_ATTRIBUTE + +#ifdef __QAIC_DEBUG__ + #ifndef __QAIC_DBG_PRINTF__ + #include + #define __QAIC_DBG_PRINTF__( ee ) do { printf ee ; } while(0) + #endif +#else + #define __QAIC_DBG_PRINTF__( ee ) (void)0 +#endif + + +#define _OFFSET(src, sof) ((void*)(((char*)(src)) + (sof))) + +#define _COPY(dst, dof, src, sof, sz) \ + do {\ + struct __copy { \ + char ar[sz]; \ + };\ + *(struct __copy*)_OFFSET(dst, dof) = *(struct __copy*)_OFFSET(src, sof);\ + } while (0) + +#define _COPYIF(dst, dof, src, sof, sz) \ + do {\ + if(_OFFSET(dst, dof) != _OFFSET(src, sof)) {\ + _COPY(dst, dof, src, sof, sz); \ + } \ + } while (0) + +_ATTRIBUTE_UNUSED +static __inline void _qaic_memmove(void* dst, void* src, int size) { + int i = 0; + for(i = 0; i < size; ++i) { + ((char*)dst)[i] = ((char*)src)[i]; + } +} + +#define _MEMMOVEIF(dst, src, sz) \ + do {\ + if(dst != src) {\ + _qaic_memmove(dst, src, sz);\ + } \ + } while (0) + + +#define _ASSIGN(dst, src, sof) \ + do {\ + dst = OFFSET(src, sof); \ + } while (0) + +#define _STD_STRLEN_IF(str) (str == 0 ? 0 : strlen(str)) + +#include "AEEStdErr.h" + +#ifdef _WIN32 +#define _QAIC_FARF(level, msg, ...) 
(void)0 +#else +#define _QAIC_FARF(level, msg, ...) \ + do {\ + if(0 == (HAP_debug_v2) ) {\ + (void)0; \ + } else { \ + FARF(level, msg , ##__VA_ARGS__); \ + } \ + }while(0) +#endif //_WIN32 for _QAIC_FARF + +#define _TRY(ee, func) \ + do { \ + if (AEE_SUCCESS != ((ee) = func)) {\ + __QAIC_DBG_PRINTF__((__FILE__ ":%d:error:%d:%s\n", __LINE__, (int)(ee),#func));\ + goto ee##bail;\ + } \ + } while (0) + +#define _TRY_FARF(ee, func) \ + do { \ + if (AEE_SUCCESS != ((ee) = func)) {\ + goto ee##farf##bail;\ + } \ + } while (0) + +#define _QAIC_CATCH(exception) exception##bail: if (exception != AEE_SUCCESS) + +#define _CATCH_FARF(exception) exception##farf##bail: if (exception != AEE_SUCCESS) + +#define _QAIC_ASSERT(nErr, ff) _TRY(nErr, 0 == (ff) ? AEE_EBADPARM : AEE_SUCCESS) + +#ifdef __QAIC_DEBUG__ +#define _QAIC_ALLOCATE(nErr, pal, size, alignment, pv) _TRY(nErr, _allocator_alloc(pal, __FILE_LINE__, size, alignment, (void**)&pv));\ + _QAIC_ASSERT(nErr,pv || !(size)) +#else +#define _QAIC_ALLOCATE(nErr, pal, size, alignment, pv) _TRY(nErr, _allocator_alloc(pal, 0, size, alignment, (void**)&pv));\ + _QAIC_ASSERT(nErr,pv || !(size)) +#endif + + +#endif // _QAIC_ENV_H + +#ifdef __cplusplus +extern "C" { +#endif +#if !defined(__QAIC_STRING1_OBJECT_DEFINED__) && !defined(__STRING1_OBJECT__) +#define __QAIC_STRING1_OBJECT_DEFINED__ +#define __STRING1_OBJECT__ +typedef struct _cstring1_s { + char* data; + int dataLen; +} _cstring1_t; + +#endif /* __QAIC_STRING1_OBJECT_DEFINED__ */ +/// Enabling stub-skel mismatch check feature in the auto-gen files. +/// Please refer to the IDL documentation for more details on the feature. +/// It is fully supported only on Kailua and later targets. +#define IDL_VERSION "0.0.1" +typedef struct dsptensor dsptensor; +struct dsptensor { + int64_t ne[4]; + int64_t nb[4]; + int32_t flags; + int32_t type; + float* data; + int dataLen; +}; +/** + * Opens the handle in the specified domain. If this is the first + * handle, this creates the session. Typically this means opening + * the device, aka open("/dev/adsprpc-smd"), then calling ioctl + * device APIs to create a PD on the DSP to execute our code in, + * then asking that PD to dlopen the .so and dlsym the skel function. + * + * @param uri, _URI"&_dom=aDSP" + * _URI is a QAIC generated uri, or + * "file:///?_skel_handle_invoke&_modver=1.0" + * If the _dom parameter is not present, _dom=DEFAULT is assumed + * but not forwarded. + * Reserved uri keys: + * [0]: first unamed argument is the skel invoke function + * _dom: execution domain name, _dom=mDSP/aDSP/DEFAULT + * _modver: module version, _modver=1.0 + * _*: any other key name starting with an _ is reserved + * Unknown uri keys/values are forwarded as is. + * @param h, resulting handle + * @retval, 0 on success + */ +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_open)(const char* uri, remote_handle64* h) __QAIC_HEADER_ATTRIBUTE; +/** + * Closes a handle. If this is the last handle to close, the session + * is closed as well, releasing all the allocated resources. 
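+ *
+ * A hypothetical usage sketch (illustrative only, not part of the generated
+ * IDL output; error handling and the optional "&_dom=" domain suffix are
+ * omitted):
+ *
+ *   remote_handle64 h = 0;
+ *   if (0 == ggmlop_open(ggmlop_URI, &h)) {
+ *       ggmlop_add(h, &src0, &src1, &dst);   // dsptensor arguments
+ *       ggmlop_close(h);
+ *   }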
+ + * @param h, the handle to close + * @retval, 0 on success, should always succeed + */ +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +#ifndef ggmlop_URI +#define ggmlop_URI "file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" +#endif /*ggmlop_URI*/ +#ifdef __cplusplus +} +#endif +#endif //_GGMLOP_H diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c new file mode 100644 index 0000000000000..0350942648e2d --- /dev/null +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c @@ -0,0 +1,237 @@ +/* ggml op functions, running on Hexagon cDSP as libggmlop_skel.so + * + * currently I didn't find a general approach to compile/build this hexagon-kernel file, a manual build approach can works fine in my local dev envs. I'm working on this build issue. + * + */ + +#if 0 +#include +#include +#include +#include "HAP_farf.h" +#include "ggmlop.h" + +#define GGML_ASSERT(x) do { } while(0) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define GGML_RESTRICT + +int ggmlop_open(const char * uri, remote_handle64 * handle) { + void * tptr = NULL; + FARF(HIGH, "uri %s", uri); + tptr = (void *)malloc(1); + *handle = (remote_handle64)tptr; + assert(*handle); + return 0; +} + +int ggmlop_close(remote_handle64 handle) { + if (handle) + free((void*)handle); + return 0; +} + +int ggmlop_add(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + FARF(HIGH, "=============== DSP: ggmlop_add "); + for (size_t idx = 0; idx < src0->dataLen; idx++) { + dst->data[idx] = src0->data[idx] + src1->data[idx]; + } + + return 0; +} + +static void ggmldsp_dump_tensor(struct dsptensor * src0) { + FARF(HIGH, "ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)\n", + src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], + src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); +} + +static int ggmldsp_is_contiguous(const struct dsptensor * tensor) { + int n = 0; + size_t next_nb = sizeof(float); + if (tensor->ne[0] != 1 && tensor->nb[0] != next_nb) { + return 0; + } + next_nb *= tensor->ne[0]; + for (int i = 1; i < 4; i++) { + if (tensor->ne[i] != 1) { + if (i > n) { + if (tensor->nb[i] != next_nb) { + return 0; + } + next_nb *= tensor->ne[i]; + } else { + next_nb = tensor->ne[i] * tensor->nb[i]; + } + } + } + return 1; +} + +//FIXME: unknown issue on cDSP +int ggmlop_mulmat(remote_handle64 h, const struct dsptensor * src00, const struct dsptensor * src10, dsptensor * dst) { + FARF(HIGH, "=============== DSP: ggmlop_mulmat "); + + dsptensor * src0 = (dsptensor*)src00; + dsptensor * src1 = (dsptensor*)src10; + const int64_t ne00 = src0->ne[0]; + (void) (ne00); + const int64_t ne01 = (src0)->ne[1]; + (void) (ne01); + const int64_t ne02 = (src0)->ne[2]; + (void) (ne02); + const int64_t ne03 = (src0)->ne[3]; + (void) (ne03); + const size_t nb00 = (src0)->nb[0]; + (void) (nb00); + const size_t nb01 = (src0)->nb[1]; + (void) (nb01); + const size_t nb02 = (src0)->nb[2]; + (void) (nb02); + const size_t nb03 = (src0)->nb[3]; + (void) (nb03); + const int64_t ne10 = (src1)->ne[0]; + (void) (ne10); + const int64_t ne11 = (src1)->ne[1]; + (void) (ne11); + 
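+    /* Following ggml's convention: ne[i] is the element count and nb[i] the
+     * byte stride of dimension i for each dsptensor; the (void) casts around
+     * these locals only silence unused-variable warnings for values this
+     * kernel does not always use. */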
const int64_t ne12 = (src1)->ne[2]; + (void) (ne12); + const int64_t ne13 = (src1)->ne[3]; + (void) (ne13); + const size_t nb10 = (src1)->nb[0]; + (void) (nb10); + const size_t nb11 = (src1)->nb[1]; + (void) (nb11); + const size_t nb12 = (src1)->nb[2]; + (void) (nb12); + const size_t nb13 = (src1)->nb[3]; + (void) (nb13); + const int64_t ne0 = (dst)->ne[0]; + (void) (ne0); + const int64_t ne1 = (dst)->ne[1]; + (void) (ne1); + const int64_t ne2 = (dst)->ne[2]; + (void) (ne2); + const int64_t ne3 = (dst)->ne[3]; + (void) (ne3); + const size_t nb0 = (dst)->nb[0]; + (void) (nb0); + const size_t nb1 = (dst)->nb[1]; + (void) (nb1); + const size_t nb2 = (dst)->nb[2]; + (void) (nb2); + const size_t nb3 = (dst)->nb[3]; + (void) (nb3); + + ggmldsp_dump_tensor(src0); + ggmldsp_dump_tensor(src1); + + const int vec_dot_type = 0; + int64_t const vec_dot_num_rows = 1; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + GGML_ASSERT(nb00 == sizeof(float)); + GGML_ASSERT(nb10 == sizeof(float)); + + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + + const int64_t nr0 = ne0; + const int64_t nr1 = ne1 * ne2 * ne3; + + int chunk_size = 16; + if (nr0 == 1 || nr1 == 1) { + chunk_size = 64; + } + int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; + int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; + + if (nchunk0 * nchunk1 < nth * 4) { + nchunk0 = nr0 > nr1 ? nth : 1; + nchunk1 = nr0 > nr1 ? 1 : nth; + } + const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + + int current_chunk = 0; + + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; + + const int64_t ir0_start = dr0 * ith0; + const int64_t ir0_end = MIN(ir0_start + dr0, nr0); + + const int64_t ir1_start = dr1 * ith1; + const int64_t ir1_end = MIN(ir1_start + dr1, nr1); + + int64_t num_rows_per_vec_dot = vec_dot_num_rows; + + const int src1_cont = ggmldsp_is_contiguous(src1); + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + const void * wdata = src1->data; + const size_t row_size = sizeof(float) * ne10; + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + float tmp[32]; + + for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int64_t ir1 = iir1; + ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int64_t i13 = (ir1 / (ne12 * ne1)); + const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char *)src0->data + (0 + i02 * nb02 + i03 * nb03); + + const char * src1_col = (const char *)wdata + + (src1_cont || src1->type != vec_dot_type + ? 
(i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + float * dst_col = (float *)((char *) dst->data + + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + + for (int64_t ir0 = iir0; + ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + + float sumf = 0.0; + const float * GGML_RESTRICT x = (float*)src0_row + ir0 * nb01; + const float * GGML_RESTRICT y = (float*)src1_col; + float * GGML_RESTRICT s = &tmp[ir0 - iir0]; + for (int i = 0; i < ne00; i++) { + sumf += x[i] * y[i]; + } + *s = sumf; + + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), + (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + } + } + } + } + + return 0; +} +#endif diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_stub.c b/ggml/src/ggml-qnn/kernels/ggmlop_stub.c new file mode 100644 index 0000000000000..6313348d7ea2d --- /dev/null +++ b/ggml/src/ggml-qnn/kernels/ggmlop_stub.c @@ -0,0 +1,437 @@ +#ifndef _GGMLOP_STUB_H +#define _GGMLOP_STUB_H +/// @file ggmlop.idl +/// +//qidl copyright +//qidl nested=false +#include "ggmlop.h" +#include +#ifndef _WIN32 +#include "HAP_farf.h" +#include +#endif //_WIN32 for HAP_farf +#ifndef _ALLOCATOR_H +#define _ALLOCATOR_H + +#include +#include + +typedef struct _heap _heap; +struct _heap { + _heap* pPrev; + const char* loc; + uint64_t buf; +}; + +typedef struct _allocator { + _heap* pheap; + uint8_t* stack; + uint8_t* stackEnd; + int nSize; +} _allocator; + +_ATTRIBUTE_UNUSED +static __inline int _heap_alloc(_heap** ppa, const char* loc, int size, void** ppbuf) { + _heap* pn = 0; + pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); + if(pn != 0) { + pn->pPrev = *ppa; + pn->loc = loc; + *ppa = pn; + *ppbuf = (void*)&(pn->buf); + return 0; + } else { + return -1; + } +} +#define _ALIGN_SIZE(x, y) (((x) + (y-1)) & ~(y-1)) + +_ATTRIBUTE_UNUSED +static __inline int _allocator_alloc(_allocator* me, + const char* loc, + int size, + unsigned int al, + void** ppbuf) { + if(size < 0) { + return -1; + } else if (size == 0) { + *ppbuf = 0; + return 0; + } + if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < (uintptr_t)me->stack + (size_t)me->nSize) { + *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); + me->stackEnd = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; + return 0; + } else { + return _heap_alloc(&me->pheap, loc, size, ppbuf); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_deinit(_allocator* me) { + _heap* pa = me->pheap; + while(pa != 0) { + _heap* pn = pa; + const char* loc = pn->loc; + (void)loc; + pa = pn->pPrev; + FREE(pn); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_init(_allocator* me, uint8_t* stack, int stackSize) { + me->stack = stack; + me->stackEnd = stack + stackSize; + me->nSize = stackSize; + me->pheap = 0; +} + + +#endif // _ALLOCATOR_H + +#ifndef SLIM_H +#define SLIM_H + +#include + +//a C data structure for the idl types that can be used to implement +//static and dynamic language bindings fairly efficiently. +// +//the goal is to have a minimal ROM and RAM footprint and without +//doing too many allocations. A good way to package these things seemed +//like the module boundary, so all the idls within one module can share +//all the type references. 
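+//
+//In this stub the descriptors below model the ggmlop interface: open/close
+//plus add and mulmat, where add and mulmat share one method descriptor since
+//both take (dsptensor, dsptensor, dsptensor); the strings table carries the
+//member and parameter names ("ne", "nb", "data", ...) that methodStrings
+//references by offset.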
+ + +#define PARAMETER_IN 0x0 +#define PARAMETER_OUT 0x1 +#define PARAMETER_INOUT 0x2 +#define PARAMETER_ROUT 0x3 +#define PARAMETER_INROUT 0x4 + +//the types that we get from idl +#define TYPE_OBJECT 0x0 +#define TYPE_INTERFACE 0x1 +#define TYPE_PRIMITIVE 0x2 +#define TYPE_ENUM 0x3 +#define TYPE_STRING 0x4 +#define TYPE_WSTRING 0x5 +#define TYPE_STRUCTURE 0x6 +#define TYPE_UNION 0x7 +#define TYPE_ARRAY 0x8 +#define TYPE_SEQUENCE 0x9 + +//these require the pack/unpack to recurse +//so it's a hint to those languages that can optimize in cases where +//recursion isn't necessary. +#define TYPE_COMPLEX_STRUCTURE (0x10 | TYPE_STRUCTURE) +#define TYPE_COMPLEX_UNION (0x10 | TYPE_UNION) +#define TYPE_COMPLEX_ARRAY (0x10 | TYPE_ARRAY) +#define TYPE_COMPLEX_SEQUENCE (0x10 | TYPE_SEQUENCE) + + +typedef struct Type Type; + +#define INHERIT_TYPE\ + int32_t nativeSize; /*in the simple case its the same as wire size and alignment*/\ + union {\ + struct {\ + const uintptr_t p1;\ + const uintptr_t p2;\ + } _cast;\ + struct {\ + uint32_t iid;\ + uint32_t bNotNil;\ + } object;\ + struct {\ + const Type *arrayType;\ + int32_t nItems;\ + } array;\ + struct {\ + const Type *seqType;\ + int32_t nMaxLen;\ + } seqSimple; \ + struct {\ + uint32_t bFloating;\ + uint32_t bSigned;\ + } prim; \ + const SequenceType* seqComplex;\ + const UnionType *unionType;\ + const StructType *structType;\ + int32_t stringMaxLen;\ + uint8_t bInterfaceNotNil;\ + } param;\ + uint8_t type;\ + uint8_t nativeAlignment\ + +typedef struct UnionType UnionType; +typedef struct StructType StructType; +typedef struct SequenceType SequenceType; +struct Type { + INHERIT_TYPE; +}; + +struct SequenceType { + const Type * seqType; + uint32_t nMaxLen; + uint32_t inSize; + uint32_t routSizePrimIn; + uint32_t routSizePrimROut; +}; + +//byte offset from the start of the case values for +//this unions case value array. it MUST be aligned +//at the alignment requrements for the descriptor +// +//if negative it means that the unions cases are +//simple enumerators, so the value read from the descriptor +//can be used directly to find the correct case +typedef union CaseValuePtr CaseValuePtr; +union CaseValuePtr { + const uint8_t* value8s; + const uint16_t* value16s; + const uint32_t* value32s; + const uint64_t* value64s; +}; + +//these are only used in complex cases +//so I pulled them out of the type definition as references to make +//the type smaller +struct UnionType { + const Type *descriptor; + uint32_t nCases; + const CaseValuePtr caseValues; + const Type * const *cases; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; + uint8_t inCaseAlignment; + uint8_t routCaseAlignmentPrimIn; + uint8_t routCaseAlignmentPrimROut; + uint8_t nativeCaseAlignment; + uint8_t bDefaultCase; +}; + +struct StructType { + uint32_t nMembers; + const Type * const *members; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; +}; + +typedef struct Parameter Parameter; +struct Parameter { + INHERIT_TYPE; + uint8_t mode; + uint8_t bNotNil; +}; + +#define SLIM_IFPTR32(is32,is64) (sizeof(uintptr_t) == 4 ? 
(is32) : (is64)) +#define SLIM_SCALARS_IS_DYNAMIC(u) (((u) & 0x00ffffff) == 0x00ffffff) + +typedef struct Method Method; +struct Method { + uint32_t uScalars; //no method index + int32_t primInSize; + int32_t primROutSize; + int maxArgs; + int numParams; + const Parameter * const *params; + uint8_t primInAlignment; + uint8_t primROutAlignment; +}; + +typedef struct Interface Interface; + +struct Interface { + int nMethods; + const Method * const *methodArray; + int nIIds; + const uint32_t *iids; + const uint16_t* methodStringArray; + const uint16_t* methodStrings; + const char* strings; +}; + + +#endif //SLIM_H + + +#ifndef _GGMLOP_SLIM_H +#define _GGMLOP_SLIM_H +#include + +#ifndef __QAIC_SLIM +#define __QAIC_SLIM(ff) ff +#endif +#ifndef __QAIC_SLIM_EXPORT +#define __QAIC_SLIM_EXPORT +#endif + +static const Type types[5]; +static const Type* const typeArrays[5] = {&(types[0]),&(types[0]),&(types[2]),&(types[2]),&(types[3])}; +static const StructType structTypes[1] = {{0x5,&(typeArrays[0]),0x50,0x4,0x48,0x8,0x4,0x8}}; +static const Type types[5] = {{0x20,{{(const uintptr_t)&(types[1]),(const uintptr_t)0x4}}, 8,0x8},{0x8,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x8},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x50,0x58),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,0,0},{SLIM_IFPTR32(0x50,0x58),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,3,0}}; +static const Parameter* const parameterArrays[6] = {(&(parameters[3])),(&(parameters[3])),(&(parameters[4])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[3] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xa4,0x48,3,3,(&(parameterArrays[0])),0x8,0x8}}; +static const Method* const methodArrays[4] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[2])}; +static const char strings[65] = "mulmat\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0add\0uri\0nb\0ne\0h\0"; +static const uint16_t methodStrings[43] = {0,34,59,56,7,29,24,19,59,56,7,29,24,44,59,56,7,29,24,48,34,59,56,7,29,24,19,59,56,7,29,24,44,59,56,7,29,24,39,52,62,13,62}; +static const uint16_t methodStringsArrays[4] = {38,41,19,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {4,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +#endif //_GGMLOP_SLIM_H + + +#ifdef __cplusplus +extern "C" { +#endif +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_open)(const char* uri, remote_handle64* h) __QAIC_STUB_ATTRIBUTE { + return __QAIC_REMOTE(remote_handle64_open)(uri, h); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { + return __QAIC_REMOTE(remote_handle64_close)(h); +} +static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint64_t _rout0[4], 
_ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[1], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED char* _rout4[1], _ATTRIBUTE_UNUSED uint32_t _rout4Len[1]) { + int _nErr = 0; + remote_arg* _praROutPostStart = _praROutPost; + remote_arg** _ppraROutPostStart = _ppraROutPost; + _ppraROutPost = &_praROutPost; + _COPY(_rout0, 0, _primROut, 0, 32); + _COPY(_rout1, 0, _primROut, 32, 32); + _COPY(_rout2, 0, _primROut, 64, 4); + _COPY(_rout3, 0, _primROut, 68, 4); + _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; + return _nErr; +} +static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint64_t _rout0[4], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[1], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED char* _rout4[1], _ATTRIBUTE_UNUSED uint32_t _rout4Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _rout4Len, 0, 4); + _praROut[0].buf.pv = _rout4[0]; + _praROut[0].buf.nLen = (4 * _rout4Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 0; + _ppraROutStart[0] += (_praROut - _praROutStart) +1; + return _nErr; +} +static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint64_t _in0[4], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[1], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED char* _in4[1], _ATTRIBUTE_UNUSED uint32_t _in4Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _in0, 0, 32); + _COPY(_primIn, 32, _in1, 0, 32); + _COPY(_primIn, 64, _in2, 0, 4); + _COPY(_primIn, 68, _in3, 0, 4); + _COPY(_primIn, 72, _in4Len, 0, 4); + _praIn[0].buf.pv = (void*) _in4[0]; + _praIn[0].buf.nLen = (4 * _in4Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 1; + _ppraROutStart[0] += (_praROut - _praROutStart) +0; + return _nErr; +} +static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint64_t _rout0[4], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[1], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED char* _rout4[1], _ATTRIBUTE_UNUSED uint32_t _rout4Len[1]) { + _numIn[0] += 0; + _numROut[0] += 1; + _numInH[0] += 0; + _numROutH[0] += 0; +} +static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint64_t 
_in0[4], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[1], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED char* _in4[1], _ATTRIBUTE_UNUSED uint32_t _in4Len[1]) { + _numIn[0] += 1; + _numROut[0] += 0; + _numInH[0] += 0; + _numROutH[0] += 0; +} +static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_t _in0[SLIM_IFPTR32(10, 11)], uint64_t _in1[SLIM_IFPTR32(10, 11)], uint64_t _rout2[SLIM_IFPTR32(10, 11)]) { + remote_arg* _pra = 0; + int _numIn[1] = {0}; + int _numROut[1] = {0}; + int _numInH[1] = {0}; + int _numROutH[1] = {0}; + _allocator _al[1] = {{0}}; + uint64_t _primIn[21]= {0}; + uint64_t _primROut[9]= {0}; + remote_arg* _praIn = 0; + remote_arg* _praROut = 0; + remote_arg* _praROutPost = 0; + remote_arg** _ppraROutPost = &_praROutPost; + remote_arg** _ppraIn = &_praIn; + remote_arg** _ppraROut = &_praROut; + remote_arg* _praHIn = 0; + remote_arg** _ppraHIn = &_praHIn; + remote_arg* _praHROut = 0; + remote_arg** _ppraHROut = &_praHROut; + int _nErr = 0; + _numIn[0] = 0; + _numROut[0] = 0; + _numInH[0] = 0; + _numROutH[0] = 0; + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint64_t*)&(((uint64_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[4]), (uint32_t*)&(((uint32_t*)_in0)[16]), (uint32_t*)&(((uint32_t*)_in0)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[18]), (char**)&(((uint64_t*)_in0)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[19]), (uint32_t*)&(((uint32_t*)_in0)[20]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint64_t*)&(((uint64_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[4]), (uint32_t*)&(((uint32_t*)_in1)[16]), (uint32_t*)&(((uint32_t*)_in1)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[18]), (char**)&(((uint64_t*)_in1)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[19]), (uint32_t*)&(((uint32_t*)_in1)[20]))); + _count(_numIn, _numROut, _numInH, _numROutH, (uint64_t*)&(((uint64_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[4]), (uint32_t*)&(((uint32_t*)_rout2)[16]), (uint32_t*)&(((uint32_t*)_rout2)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[18]), (char**)&(((uint64_t*)_rout2)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[19]), (uint32_t*)&(((uint32_t*)_rout2)[20]))); + if(_numIn[0]>=255){ + _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of input buffers\n"); + return AEE_EUNSUPPORTED; + } + if(_numROut[0]>=255){ + _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of output buffers\n"); + return AEE_EUNSUPPORTED; + } + _allocator_init(_al, 0, 0); + _QAIC_ALLOCATE(_nErr, _al, ((((((((_numIn[0] + _numROut[0]) + _numInH[0]) + _numROutH[0]) + 1) + 1) + 0) + 0) * sizeof(_pra[0])), 4, _pra); + _QAIC_ASSERT(_nErr, _pra); + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _pra[(_numIn[0] + 1)].buf.pv = (void*)_primROut; + _pra[(_numIn[0] + 1)].buf.nLen = sizeof(_primROut); + _praIn = (_pra + 1); + _praROut = (_praIn + _numIn[0] + 1); + _praROutPost = _praROut; + if(_praHIn == 0) + { + _praHIn = ((_praROut + _numROut[0]) + 1); + } + if(_praHROut == 0) + (_praHROut = _praHIn + _numInH[0] + 0); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint64_t*)&(((uint64_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[4]), (uint32_t*)&(((uint32_t*)_in0)[16]), (uint32_t*)&(((uint32_t*)_in0)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[18]), (char**)&(((uint64_t*)_in0)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[19]), (uint32_t*)&(((uint32_t*)_in0)[20])))); + _TRY(_nErr, 
_stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 80), 0, (uint64_t*)&(((uint64_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[4]), (uint32_t*)&(((uint32_t*)_in1)[16]), (uint32_t*)&(((uint32_t*)_in1)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[18]), (char**)&(((uint64_t*)_in1)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[19]), (uint32_t*)&(((uint32_t*)_in1)[20])))); + _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 160), ((char*)_primROut + 0), (uint64_t*)&(((uint64_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[4]), (uint32_t*)&(((uint32_t*)_rout2)[16]), (uint32_t*)&(((uint32_t*)_rout2)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[18]), (char**)&(((uint64_t*)_rout2)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[19]), (uint32_t*)&(((uint32_t*)_rout2)[20])))); + _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); + _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); + _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint64_t*)&(((uint64_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[4]), (uint32_t*)&(((uint32_t*)_rout2)[16]), (uint32_t*)&(((uint32_t*)_rout2)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[18]), (char**)&(((uint64_t*)_rout2)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[19]), (uint32_t*)&(((uint32_t*)_rout2)[20])))); + _QAIC_CATCH(_nErr) {} + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _mid, __func__); + } + _allocator_deinit(_al); + return _nErr; +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_add)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 2; + return _stub_method(_handle, _mid, (uint64_t*)src0, (uint64_t*)src1, (uint64_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_mulmat)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 3; + return _stub_method(_handle, _mid, (uint64_t*)src0, (uint64_t*)src1, (uint64_t*)dst); +} +#ifdef __cplusplus +} +#endif +#endif //_GGMLOP_STUB_H diff --git a/ggml/src/ggml-qnn/kernels/libggmlop_skel.so b/ggml/src/ggml-qnn/kernels/libggmlop_skel.so new file mode 100755 index 0000000000000000000000000000000000000000..9d4be24f3263907fa6c2bf83eacbf6fe17941dd9 GIT binary patch literal 13896 zcmeHOdsJJ;nIBy}5RQWd78t+9!wzX6~KgP(@9pLZM)CP_ieOj7pK%Si)EjXel!=6)Rv1 zS=NkP&U17*V-(etpkr~l0lXB)ImRezkVJB@GcwW{2Bb)Gl@w2d%CR3jMOoGiUJf1Y zO~zh#x3So0!atza*3J%(*Wo+h?lAg1n@WxAj3!z66QI;qf`PFmpy@1)-JVcJ26%lE zUI9$?qWYwxrPhEhSrE^k!BW{q;DS{AAn1TAK~F<&K;w|SPL#xd$o{@093(8|C;jgN zZa{u2k1(}!G>fq<5`GS}AeXTnpy_NLdlgg;WdAK0fD-;4q5=BU7o?w04Il-QeGZQscc;(oPcVZ7m-nEfqaB0ppu^eG-rD7Ec67FPtu1Lf=y&@Z$mri7vdHD?b2?hu zyP$TD+u`%~G|A?j&CN3F@xo8pby$@gakO{!dJec9J>BihvD@$I>43d%cWb-P@9uWA zbUQoUP-*cvnw&nj?++OGg0tO4b@sJ;x~N(3AC|t@?exNm+f7M+&jAM#;c5Fv8)7J~ z9Zg~O`P|)N%b+1-OP$V+4v#CT#thG>3r{A|R@V$S>#0SzEtVpND#P%d2{eY}K=X(+ z)WxJl_+2Oo=YwWTJdKNViJu3WFhs;TXO1NMI6`Ay4oaB;otwZDE6#(U7{=^2#56>j 
z2egQph%`qG8yL$2je?$*_}_v)EAiPmj6IjYhb5l!U6lBOWtdkb-UR-##NQA8hQ!-o zM~6sd)qt*%_zj>GDSZHRO9IbJ{6nCWpLA+L?GoPzO13E73wl)IPlA3Ufqz2cp9Q5z z`WHc;OW*~GzXlpk;Lk}s!$UkS@lj9}Fxk<89+h|+1Of0lGyo7zg3qOnMtFjFDg7zp zsS^U^+VJ1zWO5T%f7t&`gQQ^EbC)J_H~6Y5Y!3^9mlgi zMS-%QRA76jAwTf~>s$O<_Vp9f|MNT(P6RaqmJQ)#?ak@wyX%F);37T}TqZ1ghzWnc ziwfP(W39PZy+6(J*u+l$vEUZrv0$1dEx?4wL^Ii!l7HZjqJUigfAsw_$!w$>c5nP!?$oN*?S`iU%rJ|QsB_$-$pws(9T>a=Q_3N z0NS4k+;xUFX9BiAf@3!mQuiN5o2K(F@9({le^`Nj<0_gS)bmZ=+`7n6&ZWqZ&cd|M zm|0&AZ|TV!jSOAm7d-oUGixx^M~0T8j~UPUGvS|B;B@TX>3OEc_5O9h?2ILY`@Ful z6nx`FOMY&xKA;r*UX{?~ed1i*@vCpDdas(Tuaev#Sn`$j(g!CTTyM86608eHf}EAR zUIzaCkx0<>SuT9Isd&>vNY6%GmtH!buU7>s?CL<-g7q1~#j4}X@>VhBb-;j`w>_ z!h&Z{h3k&9ou}^GbWYj8+3OxGraS^C`gql>?FMKk%9Ph`*JTRO3f+Ca<) zUCNh5`9kr0R!dnhAH1BG1HJ+*=MMot0W90#Y^?7HFxgoEJB=e{!OD6Tuv*K4IpEKV zzAFL@_zJLw567zD{~6$uz%SOC;R_2WY+1ucfK#mD7z?Y1>+YT2mwh%eWU-e8-RH}K zE>JV56?yZ)uLM6L)q??Zz)IM*)vE)W#(y*vhF#vS3A_baix3Q|EJDm$uLuahL+2F73;Ej--dR0q4uuh)%M#0j{~|7EA+H zSB=EZ)^0oA3cUJd_*W z9~*)E?||>BpBLC>PYcWqM}}U8EM@yku_MT5w`&8p!3wMz z&`-P2Ph;pOP&265PvBRApXn!H4p{D|m5KcX*^`O=1iT9EFZPo~6Bq&>7%dA@e<{#k z9G;=33uRT*Uw$*!&|?2)#U0SMpkH#JZ>_^P9xn@q;6IONL=B#+5Ml%HyMX-ep8vaC z*l2WenVBK<`}U>7`D;!YU$U{in70m9;Ca+Tk92g6e1_JH!`Invrfmi&jq`Ull}@_$8` z$9!29R>n9>)^);$kYzP)=$ie<$&lwW7-GD_xCJFV4~rH5)Oj`Qa<0nRGKmQntz zTn^C!p406Tf>qouS{LH}OdR6AMKTt(>hqHhb+6`a%ZhD6 zPQ!+Q?BQIV^XCc5;cULqTNz^Y<*@rv^K#cO6ym(R7C>H;2YAI_=jA_QUOs)#r#~-0uspx~f9GSHnbp%gOmmi2z*f(uifp-QXjp>s3_|ws4a{q|nFb9U zuMe6q-z>RPQgg zur1#|!?p~YnhH&3{g&?w`Yk42zhxk#-|}PREz)}4=8zZl=--6-`3~gEtgYU!z}j~f z@>2zhf_X@L%Qnz+_B?A~{KB3Ejm)oy-`Z+JLmJQObC8xQ3|e^5Y{+O`IElQMw>A~# z@Zav)*!XY$4CK8W_3Gw8pX5@ReG07gD_~!-r+U9&%3X65`ACkrwGVT1AFUZA&#bK8 zzfeFOk;bt8_9gk&C;GOyXCcbZkbIjf`nI@7A^NtsXHk^->qNhD*W~agdNw!yyFVNG zUX6M)Fdk;sSJ}UXD2x1H*gvqpIryIGvZ28y`sefVBK3Rgc((VXX~@|0*{rY*ZNDYT zXFOG|wXf`B19`QIz`(E5H?M4C1Nqqd=IB>GlT*tA2JCBBX68Melac>Jtl#wtfg4iX zz2H6l-RR?UgH|O!82s@>Nlq@_TZz|(EC#l>@4fKN=?0ZlK9kC4yx02>-g&z5j#CF( z2AT~@??8CpeV9oBW$e=L|Id?lFr<+l}h0=gdWpV}IJG7Ws$cw{K`E*85{n>Loghk~@f%dWIfrkdx*MhXo53#+s- z1%G^W?cT^xHukOyt2Hr=9pi9EWT*)50E~ZRG>c#9=Y>#EZPEA|VPiS=T|WoS0nOsS z>(}rj!4&L;SAs4BtpuG5+5)-|^hrD;&&(Tu1#Nwhud`2fkUGJ+=t%s-(LV?Z6<9?#Uh#b?AB9%TH=}u0LPE z9P}ECDbFm5@u6d*++NiU<-P?sbiC?D`k4hcu6_~uLi({NQP|{a)29}}2W=zkkFNgJ z8w&g3(Mk&!(BI7(4xb;a;$U+?2;n_&V6?K{(6GT4sj5F4sT?pzf*UXv750Hq#J=?#VD=%^0*wsVmxh54-m7z3kgpr1by zv9d6GS7CW{G^bo)XTsr8CI|q+<C9#{Ocw7j0!vozf$6 zp$NXKDW+arcD~$zJlUbWlM6={#n_(u$%Xs47}lks=S45I zZ>}X0%)$2{)tQtq#8N$gijOP1pv5^!-S|Tke~K zxf0{gnrk^6d=B%^1jhOtjQwM0B0<$z%cS~6E~XyQ#3pQu#+s4Fg;eiiZR)>LF&KOl zW$myW!FPwjAmgp~)!7XVk6Q+%O zu@uZ92CuE9KWl`kDL;Tfy6YQlGz)CKbMXRiI)sEOAN7x2eU ztP;L*;=XEa?B-i$m}X+>@Bshl=rZ`BzItr4nSXfn_20bNpvPF1^RuRlVr}A@`rI*h|nQ3S*Dz`;IJ6pC*&J$Wtj(t*dcJHKe``noNd|E7J-@MpF zm>Zj@DO+RDwgfzj{0@+S5eSgWcc!S~Kag5#Bu z;M1UwfZhX2YnqzYDSUg?&>FR^djB!_Ogy_FrD5IbM;cOQ0`>qW*=vJC^YH!hU3}Y| zGs61j{nu*oJLh72^IZJeyz3`~#ly+pju&J7rL|7}ZoHDMV)I3-xok7rz*5*!oUhDJ z;MXMZCicf@-GNi(a-4>g!>)9+*3#i@^|AN~oUhwe%$l8kC-Wcl;z+97Wn$vFShLU1 z=&X!R!Md7Qmz%ZC)YsA8+<|kgt#SgL3Ks(yr36?hlFrrIJH1BNrjioFYUz}ZR3w>L z4&e{#o7Ne++{Gn^6(FV(!`7_^)2)0`H9j<78Rc6W&$m|2M`e7PL4m@*h99Dj;)jkk zo&eQJPSM%cKuY|~iq36Sz>nZZ18RaEoo|sYoq0t$h7(=JoipG=7mW9Q``WVmK;PF08jBD#9G8gL@%Nw z7Ijo8n96=b_?I{03Cbjlybzp3J_=dLJR2*;cBQj3N+(<7k5;7>&(SU-{hAcE^aoJt#fTKnTpqy ztX)@Hy0K-|$D1X`$S9d@Ds>@SQ_1?`^=nI7)|EOpmacP`G;Jt#7q>V!wrp&6HLWdP zzrnT2z^tCmPFzRWROIpci@xCUbQVdqbL_TlFDf&Z&{S9C_jo#dMfQq%bJflrjye&= z>su5DP3>Jp(*1|R4%E<4SlH(DwH5mO2RqzbeFuHLhQdx~x2tWdqy^~m_QEb3 zU?fbU7^r{Hi-g_F$7o8^9gX}#cUQCI$84_(J8%hSmi#a9j1n8%DU4 
zMK_n6z0URyXH$neFE7uOmuJmO*o@^Bw685$l~>^Pcds$vXH{Ntf`+(ZlvUcxN@q*4 zS4@zJ*>C;KR*mo9K1Q1vyF7k3T{fhvhs@Y~unPk+PH~UQNYg1ZcDp;AB$7C9ho2e6 z%Th*>^}G9!Al|Mrc6-EauF>5Vzr!V(a(1@6z+nGNrl1oC%;@rax_tm~(&~X3{KTzd zRp@kgAtml%M{lRm)!Nxicfy#_*GAX6oK4^kcE)KOf7rsCkfxk*YP-(b|uofuM~HC*tcMR8JE*866aOY9!@FlGGTAIm<;>z z8&~qaPbuzNnRqXeG?ciPNTE;K*C?f(-oOk_7v+H&oFQV`uToIrz2>WNR{T}5Urj%f zguk1F{}q`0rJ5<=?+7UEL-$DdC!n+^HB0yv(2Ky-1f(AY{RJ>5;kQAr0n?tD1blknmsoC_=`=`cRlCgBQTdOrFQ z>7H*5@Bkiz4&V%y!gf(+Y5dc@*B0Q!d%r&5#Cx$10w>>`TVt7BXR%$Ud;Al<##*ca%Tm;`N7J@}|EWn0?(^?)1b<+3`=0Xgrv0(SLlet;$3N{(x|~b8 z5I_5pI;?*59rHia<4Myq?GiBU%=)_z*}VK+f>w6==0tk*{fXi*B26D;gY;;9pli64 zMtV{FP^9lta*$apk>WQN(xdNM6r2X+xUOJr8K=cp3I>U8 z1C{k?J&@M~Nz7!u_&Oc;oAxynX^$WWX$OZ7^>KStKiXeVq&mvMATdp#R9A7Dr$oeI z8G}+DvUvcJiXlDPk5Ke}NV$EG%X+kKQKY?=9Ay9QMS`qH`wfb+jC5sR?@Q35b&VqR zlN@Bd01{+-w11>1%Scz|g9&=HkD+KI1u4irJPJ(qOj7zd^!^EBOAaKX_&6dxA9A?2 zfU&1#07^O(o Date: Sun, 23 Mar 2025 22:45:22 +0800 Subject: [PATCH 129/200] ggml-qnn: refine general approach through Hexagon cDSP --- ggml/src/ggml-qnn/CMakeLists.txt | 2 +- ggml/src/ggml-qnn/ggml-qnn.cpp | 268 ++++---- .../{ggmlop_stub.c => ggmlop_ap_skel.c} | 88 ++- .../kernels/{ggmlop.h => ggmlop_ap_skel.h} | 23 +- ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c | 413 +++++++++--- ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c | 596 ++++++++++++++++++ ggml/src/ggml-qnn/kernels/libggmlop_skel.so | Bin 13896 -> 13704 bytes 7 files changed, 1116 insertions(+), 274 deletions(-) rename ggml/src/ggml-qnn/kernels/{ggmlop_stub.c => ggmlop_ap_skel.c} (70%) rename ggml/src/ggml-qnn/kernels/{ggmlop.h => ggmlop_ap_skel.h} (95%) create mode 100644 ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index c63faca10e842..d5a16ffd4e1a1 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -49,7 +49,7 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_QNN") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") -file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/*.c") +file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/ggmlop_ap_skel.c") ggml_add_backend_library(ggml-qnn ${QNN_SOURCES}) target_include_directories(ggml-qnn PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR}) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 23fe675aa97f3..70bdc625fe37b 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -122,7 +122,7 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" -#include "kernels/ggmlop.h" +#include "kernels/ggmlop_ap_skel.h" // ================================================================================================= // section-1: forward/prototype declaration, global vars, macros, data structures @@ -132,7 +132,7 @@ struct qnn_parameter; struct ggml_backend_qnn_context; typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); -typedef int (* notif_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); +typedef int (* notify_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); typedef int (* ggmlhexagon_op_func_t)(remote_handle64 handle, const dsptensor * src0, const dsptensor * src1, dsptensor * dst); static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); @@ -288,10 +288,15 @@ using qnn_cgraph_node_t = 
std::tuple; using qnn_multinode_res_t = std::tuple; +enum qnn_index_type { + QNN_TENSOR_INDEX = 0, + QNN_OPCFG_INDEX = 1, +}; + enum qnn_profile_level { PROFILE_OFF = 0, PROFILE_BASIC = 1, - PROFILE_DETAIL = 2 + PROFILE_DETAIL = 2, }; //0: general approach through QNN @@ -861,63 +866,34 @@ static const char * dlerror(void) { // ================================================================================================= // section-4: general helper function // ================================================================================================= -//TODO: merge the following 6 helper functions which used to ensure every QNN tensor/opcfg name is unique -static void ggmlqnn_reset_tensoridx() { +//ensure every QNN tensor/opcfg name is unique +static void ggmlqnn_reset_idx() { g_qnntensor_idx = 0; -} - -static void ggmlqnn_inc_tensoridx() { - g_qnntensor_idx++; -} - -static int32_t ggmlqnn_get_tensoridx() { - return g_qnntensor_idx; -} - -static void ggmlqnn_reset_opcfgidx() { g_qnnopcfg_idx = 0; } -static void ggmlqnn_inc_opcfgidx() { - g_qnnopcfg_idx++; -} - -static int32_t ggmlqnn_get_opcfgidx() { - return g_qnnopcfg_idx; -} - -static void * ggmlqnn_mallocz_aligned(size_t size, size_t alignment) { - uint8_t * buffer = NULL; - size_t * sp = NULL; - buffer = static_cast(calloc(1, size + GGMLQNN_MEM_ADD(alignment))); - if (!buffer) - return NULL; - sp = (size_t *)buffer; - *sp = size; - buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); - buffer[-1] = buffer - (uint8_t *)sp; - return buffer; -} - -static void * ggmlqnn_malloc_aligned(size_t size, size_t alignment) { - uint8_t * buffer = NULL; - size_t * sp = NULL; - buffer = static_cast(malloc(size + GGMLQNN_MEM_ADD(alignment))); - if (!buffer) - return NULL; - sp = (size_t *)buffer; - *sp = size; - buffer = (uint8_t *)(((uintptr_t) buffer + GGMLQNN_MEM_ADD(alignment)) & ~GGMLQNN_MEM_MASK(alignment)); - buffer[-1] = buffer - (uint8_t *)sp; - return buffer; +static void ggmlqnn_inc_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + g_qnntensor_idx++; + break; + case QNN_OPCFG_INDEX: + g_qnnopcfg_idx++; + break; + default: + break; + } } -static void ggmqnn_free_aligned(void * ptr) { - uint8_t * old = (uint8_t *)ptr; - if (!old) - return; - old -= old[-1]; - free(old); +static int32_t ggmlqnn_get_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + return g_qnntensor_idx; + case QNN_OPCFG_INDEX: + return g_qnnopcfg_idx; + default: + break; + } } static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { @@ -994,25 +970,6 @@ static char * ggmlqnn_strndup(const char * source, size_t maxlen) { #endif } -static void * ggmlqnn_host_malloc(size_t buffer_size, size_t page_size) { - void * data = nullptr; -#if defined(__ANDROID__) || defined(__linux__) - int result = posix_memalign((void **)&data, page_size, buffer_size); - if (result != 0) { - GGMLQNN_LOG_WARN("%s: error: posix_memalign failed\n", __func__); - return nullptr; - } -#else - //GGMLQNN_LOG_DEBUG("buffer_size %d, page_size %d\n", buffer_size, page_size); - data = ggmlqnn_malloc_aligned(buffer_size, page_size); - if (nullptr == data) { - GGMLQNN_LOG_WARN("%s: error: host_malloc failed\n", __func__); - } -#endif - - return data; -} - static void ggmlqnn_get_timestring(char * p_currenttime) { time_t n_seconds = 0; struct tm * p_tm = nullptr; @@ -1027,6 +984,37 @@ static void ggmlqnn_get_timestring(char * p_currenttime) { p_tm->tm_hour, p_tm->tm_min, p_tm->tm_sec); } +//fix some 
tricky memory issue +typedef int (*pfn_mallopt)(int, int); +typedef int (*pfn_android_mallopt)(int, void *, size_t); +static void ggmlqnn_disable_android_tags(int disable) { + if (0 == disable) + return; + + void * lib_handle = dlopen("libc.so", RTLD_LAZY); + if (nullptr != lib_handle) { + int api_level = android_get_device_api_level(); + GGMLQNN_LOG_INFO("device_api_level=%d", api_level); + if (api_level >= 31) { //ANDROID 12 + pfn_mallopt mallopt = reinterpret_cast(dlsym(lib_handle, "mallopt")); + if (mallopt) { + mallopt(M_BIONIC_SET_HEAP_TAGGING_LEVEL, M_HEAP_TAGGING_LEVEL_NONE); + } + return; + } else if (api_level >= 30) { //ANDROID 11 + /* android_get_device_api_level() < 31 */ + pfn_android_mallopt android_mallopt = reinterpret_cast(dlsym( + lib_handle, "android_mallopt")); + if (android_mallopt) { + int android_malloc_tag_level = 0; + int tmp = 0; + android_mallopt(8, &tmp, sizeof(tmp)); + } + } + dlclose(lib_handle); + } +} + // ================================================================================================= // section-5: QNN helper function // ================================================================================================= @@ -1359,12 +1347,12 @@ static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * p //ensure the opcfg name is unique if (nullptr == name) { - snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_opcfgidx()); + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_idx(QNN_OPCFG_INDEX)); } else { - snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_opcfgidx()); + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_idx(QNN_OPCFG_INDEX)); } GGMLQNN_LOG_DEBUG("create qnn opconfig %s", opcfg_name); - ggmlqnn_inc_opcfgidx(); + ggmlqnn_inc_idx(QNN_OPCFG_INDEX); Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, num_params, params, @@ -1860,7 +1848,7 @@ static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t * capability, u return hexagon_error; } -static int ggmlhexagon_request_status_notifications(int domain_id, void * context, notif_callback_fn call_back_fn) { +static int ggmlhexagon_request_status_notifications(int domain_id, void * context, notify_callback_fn call_back_fn) { int hexagon_error = AEE_SUCCESS; struct remote_rpc_notif_register notif; bool status_notification_support; @@ -2019,17 +2007,8 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { goto bail; } uri = my_domain->uri; - } else { - domain_info = &domains_info[domain_id]; - uri = (char *)malloc(MAX_DOMAIN_NAMELEN); - if (nullptr == uri) { - hexagon_error = AEE_ENOMEMORY; - GGMLQNN_LOG_DEBUG("unable to allocated memory for uri of size: %d", MAX_DOMAIN_NAMELEN); - goto bail; - } - snprintf(uri, MAX_DOMAIN_NAMELEN, "%s%s", "&_dom=", domain_info->name); } - GGMLQNN_LOG_INFO("\ndomain uri=%s\n", uri); + GGMLQNN_LOG_INFO("domain uri=%s\n", uri); if (1 == unsignedpd_flag) { is_unsignedpd_enabled = ggmlhexagon_is_unsignedpd_supported(domain_id); @@ -2078,7 +2057,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { ggmlhexagon_set_clocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1); ggmlhexagon_set_rpc_latency(domain_id, RPC_POLL_QOS, 1000); } else { - GGMLQNN_LOG_WARN("error 0x%x: failed to compute on domain %d(%s)", hexagon_error, domain_id, + GGMLQNN_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, ggmlhexagon_get_dsp_name(domain_id)); goto bail; } @@ -2089,10 +2068,6 @@ static int 
ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { free(ggmlop_domain_uri); } - if (uri) { - free(uri); - } - if (ctx->rpc_mempool) { rpcmem_free(ctx->rpc_mempool); ctx->rpc_mempool = nullptr; @@ -2153,27 +2128,54 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens } if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) { - dsptensor_0.data = static_cast(wdata); - dsptensor_0.dataLen = ctx->desired_size; + dsptensor_0.data = wdata; + dsptensor_0.data_len = ctx->desired_size; } else { - dsptensor_0.data = static_cast(src0->data); - dsptensor_0.dataLen = ggml_nbytes(src0); - } - dsptensor_1.data = static_cast(src1->data); - dsptensor_2.data = static_cast(dst->data); - dsptensor_0.type = GGML_TYPE_F32; - dsptensor_1.type = GGML_TYPE_F32; - dsptensor_2.type = GGML_TYPE_F32; + dsptensor_0.data = src0->data; + dsptensor_0.data_len= ggml_nbytes(src0); + } + + dsptensor_1.data = src1->data; + dsptensor_2.data = dst->data; + dsptensor_0.ne[0] = src0->ne[0]; dsptensor_0.ne[1] = src0->ne[1]; dsptensor_0.ne[2] = src0->ne[2]; dsptensor_0.ne[3] = src0->ne[3]; + dsptensor_0.nb[0] = src0->nb[0]; dsptensor_0.nb[1] = src0->nb[1]; dsptensor_0.nb[2] = src0->nb[2]; dsptensor_0.nb[3] = src0->nb[3]; - dsptensor_1.dataLen = ggml_nbytes(src1); - dsptensor_2.dataLen = ggml_nbytes(dst); + + dsptensor_1.ne[0] = src1->ne[0]; + dsptensor_1.ne[1] = src1->ne[1]; + dsptensor_1.ne[2] = src1->ne[2]; + dsptensor_1.ne[3] = src1->ne[3]; + + dsptensor_1.nb[0] = src1->nb[0]; + dsptensor_1.nb[1] = src1->nb[1]; + dsptensor_1.nb[2] = src1->nb[2]; + dsptensor_1.nb[3] = src1->nb[3]; + + dsptensor_2.ne[0] = dst->ne[0]; + dsptensor_2.ne[1] = dst->ne[1]; + dsptensor_2.ne[2] = dst->ne[2]; + dsptensor_2.ne[3] = dst->ne[3]; + + dsptensor_2.nb[0] = dst->nb[0]; + dsptensor_2.nb[1] = dst->nb[1]; + dsptensor_2.nb[2] = dst->nb[2]; + dsptensor_2.nb[3] = dst->nb[3]; + + dsptensor_0.data_len = ggml_nbytes(src0); + dsptensor_1.data_len = ggml_nbytes(src1); + dsptensor_2.data_len = ggml_nbytes(dst); + + dsptensor_0.type = src0->type; + dsptensor_1.type = src1->type; + dsptensor_2.type = dst->type; + hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); if (AEE_SUCCESS != hexagon_error) { GGMLQNN_LOG_WARN("ggmlop computation fail on cdsp"); @@ -3667,8 +3669,7 @@ int qnn_instance::qnn_finalize() { Qnn_ErrorHandle_t error = QNN_SUCCESS; GGMLQNN_LOG_DEBUG("enter %s\n", __func__); - ggmlqnn_reset_tensoridx(); - ggmlqnn_reset_opcfgidx(); + ggmlqnn_reset_idx(); free_rpcmem(); unregister_rpcmem(); @@ -4192,6 +4193,8 @@ static void ggmlqnn_load_cfg() { memset(time_string, 0, GGML_QNN_TMPBUF_LEN); ggmlqnn_get_timestring(time_string); GGMLQNN_LOG_DEBUG("program running start time:%s", time_string); + ggmlqnn_disable_android_tags(1); + std::string cfg_filename = std::string(g_qnn_params.qnn_runtimelib_path) + std::string(g_qnn_params.qnn_cfgfilename); GGMLQNN_LOG_INFO("load ggml-qnn config from %s", cfg_filename.c_str()); qnn_cfg qnncfg_instance; @@ -4238,12 +4241,12 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn //ensure the tensor name is unique if (nullptr == name) { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmlqnn_get_tensoridx()); + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmlqnn_get_idx(QNN_TENSOR_INDEX)); } else { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmlqnn_get_tensoridx()); + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmlqnn_get_idx(QNN_TENSOR_INDEX)); } 
GGMLQNN_LOG_DEBUG("init_tensor %s", tensor_name); - ggmlqnn_inc_tensoridx(); + ggmlqnn_inc_idx(QNN_TENSOR_INDEX); uint32_t reverse_dims[GGML_MAX_DIMS] = {}; uint32_t transpose_dims[GGML_MAX_DIMS] = {}; @@ -4362,20 +4365,46 @@ static bool ggmlqnn_same_types(const ggml_backend_qnn_context * ctx, const ggml_ return false; } } + if (src0->type != GGML_TYPE_F32) return false; + return true; } +static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { + struct ggml_tensor * src0 = op_tensor->src[0]; + struct ggml_tensor * src1 = op_tensor->src[1]; + + const int64_t ne00 = op_tensor->src[0]->ne[0]; + uint32_t src0_rank = ggml_n_dims(src0); + uint32_t src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); + } + + //FIXME: mulmat on cDSP doesn't work as expected + if (op_tensor->op != GGML_OP_ADD) + return false; + + //ggmlqnn_dump_op_info(op_tensor); + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + + if (ne00 < 32) + return false; + + return ggmlqnn_same_types(ctx, op_tensor); +} + static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { if (op_tensor->op == GGML_OP_NONE) { return true; } if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { - //FIXME: mulmat on cDSP doesn't work as expected - if (op_tensor->op != GGML_OP_ADD) - return false; + return ggmlhexagon_can_handle_op(ctx, op_tensor); } if (!ggmlqnn_k_op_caps[ggmlqnn_get_op_index(op_tensor)].supported) { @@ -4384,25 +4413,17 @@ static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const st struct ggml_tensor * src0 = op_tensor->src[0]; struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = op_tensor->src[0]->ne[0]; - const int64_t ne01 = op_tensor->src[0]->ne[1]; - const int64_t ne0 = op_tensor->ne[0]; - const int64_t ne1 = op_tensor->ne[1]; - uint32_t src0_rank = ggml_n_dims(src0); uint32_t src1_rank = 0; if (nullptr != src1) { src1_rank = ggml_n_dims(src1); } - GGML_UNUSED(ne01); - GGML_UNUSED(ne0); - GGML_UNUSED(ne1); + switch (op_tensor->op) { case GGML_OP_ADD: case GGML_OP_SUB: { - //ggmlqnn_dump_op_info(op_tensor); if (!ggml_are_same_shape(src0, src1)) { return false; } @@ -4415,7 +4436,6 @@ static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const st case GGML_OP_DIV: case GGML_OP_MUL: { - //ggmlqnn_dump_op_info(op_tensor); if (ctx->device == QNN_BACKEND_NPU) return false; @@ -4597,7 +4617,7 @@ static bool ggmlqnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * struct ggml_backend_qnn_buffer_context { ~ggml_backend_qnn_buffer_context() { if (buffer) { - free(buffer); + ggml_aligned_free(buffer, 0); } for (auto * sub_buffer : sub_buffers) { @@ -4709,7 +4729,7 @@ static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); } - ctx->buffer = ggmlqnn_host_malloc(size_aligned, size_page); + ctx->buffer = ggml_aligned_malloc(size_aligned); ctx->buffer_size = size_aligned; if (nullptr == ctx->buffer) { GGMLQNN_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20)); diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_stub.c b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c similarity index 70% rename from ggml/src/ggml-qnn/kernels/ggmlop_stub.c rename to ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c index 6313348d7ea2d..1e1ce6488d25e 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_stub.c +++ 
b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c @@ -1,10 +1,3 @@ -#ifndef _GGMLOP_STUB_H -#define _GGMLOP_STUB_H -/// @file ggmlop.idl -/// -//qidl copyright -//qidl nested=false -#include "ggmlop.h" #include #ifndef _WIN32 #include "HAP_farf.h" @@ -15,6 +8,7 @@ #include #include +#include "ggmlop_ap_skel.h" typedef struct _heap _heap; struct _heap { @@ -277,16 +271,16 @@ struct Interface { #endif static const Type types[5]; -static const Type* const typeArrays[5] = {&(types[0]),&(types[0]),&(types[2]),&(types[2]),&(types[3])}; -static const StructType structTypes[1] = {{0x5,&(typeArrays[0]),0x50,0x4,0x48,0x8,0x4,0x8}}; -static const Type types[5] = {{0x20,{{(const uintptr_t)&(types[1]),(const uintptr_t)0x4}}, 8,0x8},{0x8,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x8},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; -static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x50,0x58),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,0,0},{SLIM_IFPTR32(0x50,0x58),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,3,0}}; +static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x58,0x4,0x50,0x8,0x4,0x8}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x20,{{(const uintptr_t)&(types[2]),(const uintptr_t)0x4}}, 8,0x8},{0x8,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x8},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x58,0x60),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,0,0},{SLIM_IFPTR32(0x58,0x60),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,3,0}}; static const Parameter* const parameterArrays[6] = {(&(parameters[3])),(&(parameters[3])),(&(parameters[4])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; -static const Method methods[3] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xa4,0x48,3,3,(&(parameterArrays[0])),0x8,0x8}}; +static const Method methods[3] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xb4,0x50,3,3,(&(parameterArrays[0])),0x8,0x8}}; static const Method* const methodArrays[4] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[2])}; -static const char strings[65] = "mulmat\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0add\0uri\0nb\0ne\0h\0"; 
-static const uint16_t methodStrings[43] = {0,34,59,56,7,29,24,19,59,56,7,29,24,44,59,56,7,29,24,48,34,59,56,7,29,24,19,59,56,7,29,24,44,59,56,7,29,24,39,52,62,13,62}; -static const uint16_t methodStringsArrays[4] = {38,41,19,0}; +static const char strings[68] = "mulmat\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0add\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[49] = {0,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,48,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,39,52,65,13,65}; +static const uint16_t methodStringsArrays[4] = {44,47,22,0}; __QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {4,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; #endif //_GGMLOP_SLIM_H @@ -300,19 +294,20 @@ __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_open)(const char* uri, remote_handle64 __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { return __QAIC_REMOTE(remote_handle64_close)(h); } -static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint64_t _rout0[4], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[1], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED char* _rout4[1], _ATTRIBUTE_UNUSED uint32_t _rout4Len[1]) { +static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { int _nErr = 0; remote_arg* _praROutPostStart = _praROutPost; remote_arg** _ppraROutPostStart = _ppraROutPost; _ppraROutPost = &_praROutPost; - _COPY(_rout0, 0, _primROut, 0, 32); - _COPY(_rout1, 0, _primROut, 32, 32); - _COPY(_rout2, 0, _primROut, 64, 4); - _COPY(_rout3, 0, _primROut, 68, 4); + _COPY(_rout0, 0, _primROut, 0, 4); + _COPY(_rout1, 0, _primROut, 8, 32); + _COPY(_rout2, 0, _primROut, 40, 32); + _COPY(_rout3, 0, _primROut, 72, 4); + _COPY(_rout4, 0, _primROut, 76, 4); _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; return _nErr; } -static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint64_t _rout0[4], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[1], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED char* _rout4[1], _ATTRIBUTE_UNUSED uint32_t _rout4Len[1]) { +static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, 
_ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -320,14 +315,14 @@ static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNU remote_arg** _ppraROutStart = _ppraROut; _ppraIn = &_praIn; _ppraROut = &_praROut; - _COPY(_primIn, 0, _rout4Len, 0, 4); - _praROut[0].buf.pv = _rout4[0]; - _praROut[0].buf.nLen = (4 * _rout4Len[0]); + _COPY(_primIn, 0, _rout5Len, 0, 4); + _praROut[0].buf.pv = _rout5[0]; + _praROut[0].buf.nLen = (4 * _rout5Len[0]); _ppraInStart[0] += (_praIn - _praInStart) + 0; _ppraROutStart[0] += (_praROut - _praROutStart) +1; return _nErr; } -static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint64_t _in0[4], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[1], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED char* _in4[1], _ATTRIBUTE_UNUSED uint32_t _in4Len[1]) { +static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint64_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -335,38 +330,39 @@ static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_U remote_arg** _ppraROutStart = _ppraROut; _ppraIn = &_praIn; _ppraROut = &_praROut; - _COPY(_primIn, 0, _in0, 0, 32); - _COPY(_primIn, 32, _in1, 0, 32); - _COPY(_primIn, 64, _in2, 0, 4); - _COPY(_primIn, 68, _in3, 0, 4); - _COPY(_primIn, 72, _in4Len, 0, 4); - _praIn[0].buf.pv = (void*) _in4[0]; - _praIn[0].buf.nLen = (4 * _in4Len[0]); + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 8, _in1, 0, 32); + _COPY(_primIn, 40, _in2, 0, 32); + _COPY(_primIn, 72, _in3, 0, 4); + _COPY(_primIn, 76, _in4, 0, 4); + _COPY(_primIn, 80, _in5Len, 0, 4); + _praIn[0].buf.pv = (void*) _in5[0]; + _praIn[0].buf.nLen = (4 * _in5Len[0]); _ppraInStart[0] += (_praIn - _praInStart) + 1; _ppraROutStart[0] += (_praROut - _praROutStart) +0; return _nErr; } -static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint64_t _rout0[4], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[1], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED char* _rout4[1], _ATTRIBUTE_UNUSED uint32_t _rout4Len[1]) { 
+static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { _numIn[0] += 0; _numROut[0] += 1; _numInH[0] += 0; _numROutH[0] += 0; } -static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint64_t _in0[4], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[1], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED char* _in4[1], _ATTRIBUTE_UNUSED uint32_t _in4Len[1]) { +static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint64_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { _numIn[0] += 1; _numROut[0] += 0; _numInH[0] += 0; _numROutH[0] += 0; } -static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_t _in0[SLIM_IFPTR32(10, 11)], uint64_t _in1[SLIM_IFPTR32(10, 11)], uint64_t _rout2[SLIM_IFPTR32(10, 11)]) { +static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_t _in0[SLIM_IFPTR32(11, 12)], uint64_t _in1[SLIM_IFPTR32(11, 12)], uint64_t _rout2[SLIM_IFPTR32(11, 12)]) { remote_arg* _pra = 0; int _numIn[1] = {0}; int _numROut[1] = {0}; int _numInH[1] = {0}; int _numROutH[1] = {0}; _allocator _al[1] = {{0}}; - uint64_t _primIn[21]= {0}; - uint64_t _primROut[9]= {0}; + uint64_t _primIn[23]= {0}; + uint64_t _primROut[10]= {0}; remote_arg* _praIn = 0; remote_arg* _praROut = 0; remote_arg* _praROutPost = 0; @@ -382,9 +378,9 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_ _numROut[0] = 0; _numInH[0] = 0; _numROutH[0] = 0; - _count_1(_numIn, _numROut, _numInH, _numROutH, (uint64_t*)&(((uint64_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[4]), (uint32_t*)&(((uint32_t*)_in0)[16]), (uint32_t*)&(((uint32_t*)_in0)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[18]), (char**)&(((uint64_t*)_in0)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[19]), (uint32_t*)&(((uint32_t*)_in0)[20]))); - _count_1(_numIn, _numROut, _numInH, _numROutH, (uint64_t*)&(((uint64_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[4]), (uint32_t*)&(((uint32_t*)_in1)[16]), (uint32_t*)&(((uint32_t*)_in1)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[18]), (char**)&(((uint64_t*)_in1)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[19]), (uint32_t*)&(((uint32_t*)_in1)[20]))); - _count(_numIn, _numROut, _numInH, _numROutH, (uint64_t*)&(((uint64_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[4]), (uint32_t*)&(((uint32_t*)_rout2)[16]), (uint32_t*)&(((uint32_t*)_rout2)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[18]), (char**)&(((uint64_t*)_rout2)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[19]), (uint32_t*)&(((uint32_t*)_rout2)[20]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[1]), (uint64_t*)&(((uint64_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[18]), (uint32_t*)&(((uint32_t*)_in0)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[20]), (char**)&(((uint64_t*)_in0)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[21]), (uint32_t*)&(((uint32_t*)_in0)[22]))); + 
_count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[1]), (uint64_t*)&(((uint64_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[18]), (uint32_t*)&(((uint32_t*)_in1)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[20]), (char**)&(((uint64_t*)_in1)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[21]), (uint32_t*)&(((uint32_t*)_in1)[22]))); + _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), (uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22]))); if(_numIn[0]>=255){ _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of input buffers\n"); return AEE_EUNSUPPORTED; @@ -409,13 +405,13 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_ } if(_praHROut == 0) (_praHROut = _praHIn + _numInH[0] + 0); - _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint64_t*)&(((uint64_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[4]), (uint32_t*)&(((uint32_t*)_in0)[16]), (uint32_t*)&(((uint32_t*)_in0)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[18]), (char**)&(((uint64_t*)_in0)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[19]), (uint32_t*)&(((uint32_t*)_in0)[20])))); - _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 80), 0, (uint64_t*)&(((uint64_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[4]), (uint32_t*)&(((uint32_t*)_in1)[16]), (uint32_t*)&(((uint32_t*)_in1)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[18]), (char**)&(((uint64_t*)_in1)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[19]), (uint32_t*)&(((uint32_t*)_in1)[20])))); - _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 160), ((char*)_primROut + 0), (uint64_t*)&(((uint64_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[4]), (uint32_t*)&(((uint32_t*)_rout2)[16]), (uint32_t*)&(((uint32_t*)_rout2)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[18]), (char**)&(((uint64_t*)_rout2)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[19]), (uint32_t*)&(((uint32_t*)_rout2)[20])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[1]), (uint64_t*)&(((uint64_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[18]), (uint32_t*)&(((uint32_t*)_in0)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[20]), (char**)&(((uint64_t*)_in0)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[21]), (uint32_t*)&(((uint32_t*)_in0)[22])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 88), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[1]), (uint64_t*)&(((uint64_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[18]), (uint32_t*)&(((uint32_t*)_in1)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[20]), (char**)&(((uint64_t*)_in1)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[21]), (uint32_t*)&(((uint32_t*)_in1)[22])))); + 
_TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 176), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), (uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22])))); _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); - _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint64_t*)&(((uint64_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[4]), (uint32_t*)&(((uint32_t*)_rout2)[16]), (uint32_t*)&(((uint32_t*)_rout2)[17]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[18]), (char**)&(((uint64_t*)_rout2)[9])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[19]), (uint32_t*)&(((uint32_t*)_rout2)[20])))); + _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), (uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22])))); _QAIC_CATCH(_nErr) {} _CATCH_FARF(_nErr) { _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _mid, __func__); @@ -431,7 +427,3 @@ __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_mulmat)(remote_handle64 _handle, const uint32_t _mid = 3; return _stub_method(_handle, _mid, (uint64_t*)src0, (uint64_t*)src1, (uint64_t*)dst); } -#ifdef __cplusplus -} -#endif -#endif //_GGMLOP_STUB_H diff --git a/ggml/src/ggml-qnn/kernels/ggmlop.h b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h similarity index 95% rename from ggml/src/ggml-qnn/kernels/ggmlop.h rename to ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h index b45070c20001b..0301f8f78f8d2 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop.h +++ b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h @@ -1,14 +1,12 @@ -#ifndef _GGMLOP_H -#define _GGMLOP_H -/// @file ggmlop.idl -/// -//qidl copyright -//qidl nested=false +#ifndef _GGMLOP_AP_SKEL_H +#define _GGMLOP_AP_SKEL_H + #include #include #include #include + #ifndef __QAIC_HEADER #define __QAIC_HEADER(ff) ff #endif //__QAIC_HEADER @@ -241,12 +239,13 @@ typedef struct _cstring1_s { #define IDL_VERSION "0.0.1" typedef struct dsptensor dsptensor; struct dsptensor { + int32_t type; int64_t ne[4]; int64_t nb[4]; + int32_t op; int32_t flags; - int32_t type; - float* data; - int dataLen; + void * data; + int data_len; }; /** * Opens the handle in the specified domain. 
If this is the first @@ -278,12 +277,12 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_open)(const char* uri, remote_hand * @retval, 0 on success, should always succeed */ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_add)(remote_handle64 _h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_mulmat)(remote_handle64 _h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) __QAIC_HEADER_ATTRIBUTE; #ifndef ggmlop_URI #define ggmlop_URI "file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" #endif /*ggmlop_URI*/ #ifdef __cplusplus } #endif -#endif //_GGMLOP_H +#endif //_GGMLOP_AP_SKEL_H diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c index 0350942648e2d..bddafa29ea81e 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c @@ -1,22 +1,16 @@ -/* ggml op functions, running on Hexagon cDSP as libggmlop_skel.so - * - * currently I didn't find a general approach to compile/build this hexagon-kernel file, a manual build approach can works fine in my local dev envs. I'm working on this build issue. - * - */ - -#if 0 #include #include +#include +#include +#include #include #include "HAP_farf.h" -#include "ggmlop.h" +#include "ggmlop_ap_skel.h" -#define GGML_ASSERT(x) do { } while(0) -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define GGML_RESTRICT +#define ggml_tensor dsptensor -int ggmlop_open(const char * uri, remote_handle64 * handle) { - void * tptr = NULL; +int ggmlop_open(const char*uri, remote_handle64* handle) { + void *tptr = NULL; FARF(HIGH, "uri %s", uri); tptr = (void *)malloc(1); *handle = (remote_handle64)tptr; @@ -30,100 +24,338 @@ int ggmlop_close(remote_handle64 handle) { return 0; } -int ggmlop_add(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { - FARF(HIGH, "=============== DSP: ggmlop_add "); - for (size_t idx = 0; idx < src0->dataLen; idx++) { - dst->data[idx] = src0->data[idx] + src1->data[idx]; +static void ggml_dump_tensor(struct ggml_tensor * tensor) { + FARF(HIGH, "ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)\n", + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); +} + +static void ggml_abort(const char * file, int line, const char * fmt, ...) { + //abort(); + return; +} + +#define GGML_MAX_DIMS 4 +#define GGML_UNUSED(x) (void)(x) +#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) +#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) +#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? 
(a) : (b)) +#define GGML_RESTRICT + +#define static_assert(a, b) do { } while (0) + +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS01 \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + +enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 = 5, support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_IQ2_XXS = 16, + GGML_TYPE_IQ2_XS = 17, + GGML_TYPE_IQ3_XXS = 18, + GGML_TYPE_IQ1_S = 19, + GGML_TYPE_IQ4_NL = 20, + GGML_TYPE_IQ3_S = 21, + GGML_TYPE_IQ2_S = 22, + GGML_TYPE_IQ4_XS = 23, + GGML_TYPE_I8 = 24, + GGML_TYPE_I16 = 25, + GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, + GGML_TYPE_IQ1_M = 29, + GGML_TYPE_BF16 = 30, + // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files + // GGML_TYPE_Q4_0_4_8 = 32, + // GGML_TYPE_Q4_0_8_8 = 33, + GGML_TYPE_TQ1_0 = 34, + GGML_TYPE_TQ2_0 = 35, + // GGML_TYPE_IQ4_NL_4_4 = 36, + // GGML_TYPE_IQ4_NL_4_8 = 37, + // GGML_TYPE_IQ4_NL_8_8 = 38, + GGML_TYPE_COUNT = 39, +}; + +static bool ggml_is_empty(const struct ggml_tensor * tensor) { + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + if (tensor->ne[i] == 0) { + // empty if any dimension has no elements + return true; + } } + return false; +} - return 0; +static bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return ggml_is_empty(t0) ? 
ggml_is_empty(t1) : + (t1->ne[0]%t0->ne[0] == 0) && + (t1->ne[1]%t0->ne[1] == 0) && + (t1->ne[2]%t0->ne[2] == 0) && + (t1->ne[3]%t0->ne[3] == 0); } -static void ggmldsp_dump_tensor(struct dsptensor * src0) { - FARF(HIGH, "ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)\n", - src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); +static bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return + (t0->ne[0] == t1->ne[0]) && + (t0->ne[1] == t1->ne[1]) && + (t0->ne[2] == t1->ne[2]) && + (t0->ne[3] == t1->ne[3]); +} + +static int64_t ggml_nrows(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -static int ggmldsp_is_contiguous(const struct dsptensor * tensor) { - int n = 0; - size_t next_nb = sizeof(float); - if (tensor->ne[0] != 1 && tensor->nb[0] != next_nb) { - return 0; +static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { + size_t next_nb = sizeof(float);//ggml_type_size(tensor->type); + if (tensor->ne[0] != 1/*ggml_blck_size(tensor->type)*/ && tensor->nb[0] != next_nb) { + return false; } - next_nb *= tensor->ne[0]; - for (int i = 1; i < 4; i++) { + next_nb *= tensor->ne[0]/1/*ggml_blck_size(tensor->type)*/; + for (int i = 1; i < GGML_MAX_DIMS; i++) { if (tensor->ne[i] != 1) { if (i > n) { if (tensor->nb[i] != next_nb) { - return 0; + return false; } next_nb *= tensor->ne[i]; } else { - next_nb = tensor->ne[i] * tensor->nb[i]; + // this dimension does not need to be contiguous + next_nb = tensor->ne[i]*tensor->nb[i]; + } + } + } + return true; +} + +static bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) { + return ggml_is_contiguous_n(tensor, 0); +} + +static bool ggml_is_contiguous(const struct ggml_tensor * tensor) { + return ggml_is_contiguous_0(tensor); +} + +inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } + +static void ggml_compute_forward_add_f32( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + + ggml_dump_tensor(src0); + ggml_dump_tensor(src1); +#if 1 + float * a = (float*)src0->data; + float * b = (float*)src1->data; + float * c = (float*)dst->data; + //TODO: Hexagon SIMD + for (size_t idx = 0; idx < src0->data_len; idx++) { + *c = *a + *b; + a++; + b++; + c++; + } + return; +#endif + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int ith = 0; + const int nth = 1; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + 
i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; + } + } + } +} + +int ggmlop_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) +{ + FARF(HIGH, "=============== DSP: ggmlop_add "); + switch (src0->type) { + case GGML_TYPE_F32: + { + if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f32(src0, src1, dst); + } else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_TYPE_F16: + { + if (src1->type == GGML_TYPE_F16) { + //ggml_compute_forward_add_f16_f16(dst); + } + else if (src1->type == GGML_TYPE_F32) { + //ggml_compute_forward_add_f16_f32(dst); + } + else { + GGML_ABORT("fatal error"); } + } break; + case GGML_TYPE_BF16: + { + if (src1->type == GGML_TYPE_BF16) { + //ggml_compute_forward_add_bf16_bf16(dst); + } + else if (src1->type == GGML_TYPE_F32) { + //ggml_compute_forward_add_bf16_f32(dst); + } + else { + GGML_ABORT("fatal error"); + } + } break; + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_TQ1_0: + case GGML_TYPE_TQ2_0: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ1_M: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + case GGML_TYPE_IQ2_S: + { + //ggml_compute_forward_add_q_f32(dst); + } break; + default: + { + GGML_ABORT("fatal error"); } } - return 1; + + return 0; } -//FIXME: unknown issue on cDSP -int ggmlop_mulmat(remote_handle64 h, const struct dsptensor * src00, const struct dsptensor * src10, dsptensor * dst) { + +int ggmlop_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { FARF(HIGH, "=============== DSP: ggmlop_mulmat "); - dsptensor * src0 = (dsptensor*)src00; - dsptensor * src1 = (dsptensor*)src10; - const int64_t ne00 = src0->ne[0]; - (void) (ne00); - const int64_t ne01 = (src0)->ne[1]; - (void) (ne01); - const int64_t ne02 = (src0)->ne[2]; - (void) (ne02); - const int64_t ne03 = (src0)->ne[3]; - (void) (ne03); - const size_t nb00 = (src0)->nb[0]; - (void) (nb00); - const size_t nb01 = (src0)->nb[1]; - (void) (nb01); - const size_t nb02 = (src0)->nb[2]; - (void) (nb02); - const size_t nb03 = (src0)->nb[3]; - (void) (nb03); - const 
int64_t ne10 = (src1)->ne[0]; - (void) (ne10); - const int64_t ne11 = (src1)->ne[1]; - (void) (ne11); - const int64_t ne12 = (src1)->ne[2]; - (void) (ne12); - const int64_t ne13 = (src1)->ne[3]; - (void) (ne13); - const size_t nb10 = (src1)->nb[0]; - (void) (nb10); - const size_t nb11 = (src1)->nb[1]; - (void) (nb11); - const size_t nb12 = (src1)->nb[2]; - (void) (nb12); - const size_t nb13 = (src1)->nb[3]; - (void) (nb13); - const int64_t ne0 = (dst)->ne[0]; - (void) (ne0); - const int64_t ne1 = (dst)->ne[1]; - (void) (ne1); - const int64_t ne2 = (dst)->ne[2]; - (void) (ne2); - const int64_t ne3 = (dst)->ne[3]; - (void) (ne3); - const size_t nb0 = (dst)->nb[0]; - (void) (nb0); - const size_t nb1 = (dst)->nb[1]; - (void) (nb1); - const size_t nb2 = (dst)->nb[2]; - (void) (nb2); - const size_t nb3 = (dst)->nb[3]; - (void) (nb3); - - ggmldsp_dump_tensor(src0); - ggmldsp_dump_tensor(src1); + GGML_TENSOR_BINARY_OP_LOCALS + + ggml_dump_tensor(src0); + ggml_dump_tensor(src1); const int vec_dot_type = 0; int64_t const vec_dot_num_rows = 1; @@ -145,9 +377,12 @@ int ggmlop_mulmat(remote_handle64 h, const struct dsptensor * src00, const struc const int64_t nr1 = ne1 * ne2 * ne3; int chunk_size = 16; + int nth = 1; + if (nr0 == 1 || nr1 == 1) { chunk_size = 64; } + int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; @@ -155,6 +390,7 @@ int ggmlop_mulmat(remote_handle64 h, const struct dsptensor * src00, const struc nchunk0 = nr0 > nr1 ? nth : 1; nchunk1 = nr0 > nr1 ? 1 : nth; } + const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; @@ -171,7 +407,7 @@ int ggmlop_mulmat(remote_handle64 h, const struct dsptensor * src00, const struc int64_t num_rows_per_vec_dot = vec_dot_num_rows; - const int src1_cont = ggmldsp_is_contiguous(src1); + const int src1_cont = ggml_is_contiguous(src1); const int64_t r2 = ne12 / ne02; const int64_t r3 = ne13 / ne03; @@ -234,4 +470,3 @@ int ggmlop_mulmat(remote_handle64 h, const struct dsptensor * src00, const struc return 0; } -#endif diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c new file mode 100644 index 0000000000000..33d47174bf5ef --- /dev/null +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c @@ -0,0 +1,596 @@ +#include +#ifndef _WIN32 +#include "HAP_farf.h" +#endif //_WIN32 for HAP_farf +#ifndef _ALLOCATOR_H +#define _ALLOCATOR_H + +#include +#include +#include "version_note.h" +#include "ggmlop_ap_skel.h" + +typedef struct _heap _heap; +struct _heap { + _heap* pPrev; + const char* loc; + uint64_t buf; +}; + +typedef struct _allocator { + _heap* pheap; + uint8_t* stack; + uint8_t* stackEnd; + int nSize; +} _allocator; + +_ATTRIBUTE_UNUSED +static __inline int _heap_alloc(_heap** ppa, const char* loc, int size, void** ppbuf) { + _heap* pn = 0; + pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); + if(pn != 0) { + pn->pPrev = *ppa; + pn->loc = loc; + *ppa = pn; + *ppbuf = (void*)&(pn->buf); + return 0; + } else { + return -1; + } +} +#define _ALIGN_SIZE(x, y) (((x) + (y-1)) & ~(y-1)) + +_ATTRIBUTE_UNUSED +static __inline int _allocator_alloc(_allocator* me, + const char* loc, + int size, + unsigned int al, + void** ppbuf) { + if(size < 0) { + return -1; + } else if (size == 0) { + *ppbuf = 0; + return 0; + } + if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < (uintptr_t)me->stack + (size_t)me->nSize) { + *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); + me->stackEnd = 
(uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; + return 0; + } else { + return _heap_alloc(&me->pheap, loc, size, ppbuf); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_deinit(_allocator* me) { + _heap* pa = me->pheap; + while(pa != 0) { + _heap* pn = pa; + const char* loc = pn->loc; + (void)loc; + pa = pn->pPrev; + FREE(pn); + } +} + +_ATTRIBUTE_UNUSED +static __inline void _allocator_init(_allocator* me, uint8_t* stack, int stackSize) { + me->stack = stack; + me->stackEnd = stack + stackSize; + me->nSize = stackSize; + me->pheap = 0; +} + + +#endif // _ALLOCATOR_H + +#ifndef SLIM_H +#define SLIM_H + +#include + +//a C data structure for the idl types that can be used to implement +//static and dynamic language bindings fairly efficiently. +// +//the goal is to have a minimal ROM and RAM footprint and without +//doing too many allocations. A good way to package these things seemed +//like the module boundary, so all the idls within one module can share +//all the type references. + + +#define PARAMETER_IN 0x0 +#define PARAMETER_OUT 0x1 +#define PARAMETER_INOUT 0x2 +#define PARAMETER_ROUT 0x3 +#define PARAMETER_INROUT 0x4 + +//the types that we get from idl +#define TYPE_OBJECT 0x0 +#define TYPE_INTERFACE 0x1 +#define TYPE_PRIMITIVE 0x2 +#define TYPE_ENUM 0x3 +#define TYPE_STRING 0x4 +#define TYPE_WSTRING 0x5 +#define TYPE_STRUCTURE 0x6 +#define TYPE_UNION 0x7 +#define TYPE_ARRAY 0x8 +#define TYPE_SEQUENCE 0x9 + +//these require the pack/unpack to recurse +//so it's a hint to those languages that can optimize in cases where +//recursion isn't necessary. +#define TYPE_COMPLEX_STRUCTURE (0x10 | TYPE_STRUCTURE) +#define TYPE_COMPLEX_UNION (0x10 | TYPE_UNION) +#define TYPE_COMPLEX_ARRAY (0x10 | TYPE_ARRAY) +#define TYPE_COMPLEX_SEQUENCE (0x10 | TYPE_SEQUENCE) + + +typedef struct Type Type; + +#define INHERIT_TYPE\ + int32_t nativeSize; /*in the simple case its the same as wire size and alignment*/\ + union {\ + struct {\ + const uintptr_t p1;\ + const uintptr_t p2;\ + } _cast;\ + struct {\ + uint32_t iid;\ + uint32_t bNotNil;\ + } object;\ + struct {\ + const Type *arrayType;\ + int32_t nItems;\ + } array;\ + struct {\ + const Type *seqType;\ + int32_t nMaxLen;\ + } seqSimple; \ + struct {\ + uint32_t bFloating;\ + uint32_t bSigned;\ + } prim; \ + const SequenceType* seqComplex;\ + const UnionType *unionType;\ + const StructType *structType;\ + int32_t stringMaxLen;\ + uint8_t bInterfaceNotNil;\ + } param;\ + uint8_t type;\ + uint8_t nativeAlignment\ + +typedef struct UnionType UnionType; +typedef struct StructType StructType; +typedef struct SequenceType SequenceType; +struct Type { + INHERIT_TYPE; +}; + +struct SequenceType { + const Type * seqType; + uint32_t nMaxLen; + uint32_t inSize; + uint32_t routSizePrimIn; + uint32_t routSizePrimROut; +}; + +//byte offset from the start of the case values for +//this unions case value array. 
it MUST be aligned +//at the alignment requrements for the descriptor +// +//if negative it means that the unions cases are +//simple enumerators, so the value read from the descriptor +//can be used directly to find the correct case +typedef union CaseValuePtr CaseValuePtr; +union CaseValuePtr { + const uint8_t* value8s; + const uint16_t* value16s; + const uint32_t* value32s; + const uint64_t* value64s; +}; + +//these are only used in complex cases +//so I pulled them out of the type definition as references to make +//the type smaller +struct UnionType { + const Type *descriptor; + uint32_t nCases; + const CaseValuePtr caseValues; + const Type * const *cases; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; + uint8_t inCaseAlignment; + uint8_t routCaseAlignmentPrimIn; + uint8_t routCaseAlignmentPrimROut; + uint8_t nativeCaseAlignment; + uint8_t bDefaultCase; +}; + +struct StructType { + uint32_t nMembers; + const Type * const *members; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; +}; + +typedef struct Parameter Parameter; +struct Parameter { + INHERIT_TYPE; + uint8_t mode; + uint8_t bNotNil; +}; + +#define SLIM_IFPTR32(is32,is64) (sizeof(uintptr_t) == 4 ? (is32) : (is64)) +#define SLIM_SCALARS_IS_DYNAMIC(u) (((u) & 0x00ffffff) == 0x00ffffff) + +typedef struct Method Method; +struct Method { + uint32_t uScalars; //no method index + int32_t primInSize; + int32_t primROutSize; + int maxArgs; + int numParams; + const Parameter * const *params; + uint8_t primInAlignment; + uint8_t primROutAlignment; +}; + +typedef struct Interface Interface; + +struct Interface { + int nMethods; + const Method * const *methodArray; + int nIIds; + const uint32_t *iids; + const uint16_t* methodStringArray; + const uint16_t* methodStrings; + const char* strings; +}; + + +#endif //SLIM_H + + +#ifndef _GGMLOP_SLIM_H +#define _GGMLOP_SLIM_H +#include + +#ifndef __QAIC_SLIM +#define __QAIC_SLIM(ff) ff +#endif +#ifndef __QAIC_SLIM_EXPORT +#define __QAIC_SLIM_EXPORT +#endif + +static const Type types[5]; +static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x58,0x4,0x50,0x8,0x4,0x8}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x20,{{(const uintptr_t)&(types[2]),(const uintptr_t)0x4}}, 8,0x8},{0x8,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x8},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x58,0x60),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,0,0},{SLIM_IFPTR32(0x58,0x60),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,3,0}}; +static const Parameter* const parameterArrays[6] = {(&(parameters[3])),(&(parameters[3])),(&(parameters[4])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[3] = 
{{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xb4,0x50,3,3,(&(parameterArrays[0])),0x8,0x8}}; +static const Method* const methodArrays[4] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[2])}; +static const char strings[68] = "mulmat\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0add\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[49] = {0,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,48,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,39,52,65,13,65}; +static const uint16_t methodStringsArrays[4] = {44,47,22,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {4,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +#endif //_GGMLOP_SLIM_H +extern int adsp_mmap_fd_getinfo(int, uint32_t *); +#ifdef __cplusplus +extern "C" { +#endif +_ATTRIBUTE_VISIBILITY uint32_t ggmlop_skel_handle_invoke_qaic_version = 10048; +_ATTRIBUTE_VISIBILITY char ggmlop_skel_handle_invoke_uri[77+1]="file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; +static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { + int _nErr = 0; + remote_arg* _praROutPostStart = _praROutPost; + remote_arg** _ppraROutPostStart = _ppraROutPost; + _ppraROutPost = &_praROutPost; + _COPY(_primROut, 0, _rout0, 0, 4); + _COPY(_primROut, 8, _rout1, 0, 32); + _COPY(_primROut, 40, _rout2, 0, 32); + _COPY(_primROut, 72, _rout3, 0, 4); + _COPY(_primROut, 76, _rout4, 0, 4); + _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; + return _nErr; +} +static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_rout5Len, 0, _primIn, 0, 4); + _QAIC_ASSERT(_nErr, ((_praROut[0].buf.nLen / 4)) >= (size_t)(_rout5Len[0])); + _rout5[0] = _praROut[0].buf.pv; + _ppraInStart[0] += (_praIn - _praInStart) + 0; + _ppraROutStart[0] += (_praROut - _praROutStart) +1; + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* 
_ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint64_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_in0, 0, _primIn, 0, 4); + _COPY(_in1, 0, _primIn, 8, 32); + _COPY(_in2, 0, _primIn, 40, 32); + _COPY(_in3, 0, _primIn, 72, 4); + _COPY(_in4, 0, _primIn, 76, 4); + _COPY(_in5Len, 0, _primIn, 80, 4); + _QAIC_ASSERT(_nErr, ((_praIn[0].buf.nLen / 4)) >= (size_t)(_in5Len[0])); + _in5[0] = _praIn[0].buf.pv; + _ppraInStart[0] += (_praIn - _praInStart) + 1; + _ppraROutStart[0] += (_praROut - _praROutStart) +0; + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, const dsptensor*, dsptensor*), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + uint64_t _in0[SLIM_IFPTR32(11, 12)] = {0}; + uint64_t _in1[SLIM_IFPTR32(11, 12)] = {0}; + uint64_t _rout2[SLIM_IFPTR32(11, 12)] = {0}; + uint64_t* _primIn= 0; + int _numIn[1] = {0}; + uint64_t* _primROut= 0; + int _numInH[1] = {0}; + int _numROut[1] = {0}; + remote_arg* _praIn = 0; + remote_arg* _praROut = 0; + remote_arg* _praROutPost = 0; + remote_arg** _ppraROutPost = &_praROutPost; + _allocator _al[1] = {{0}}; + remote_arg** _ppraIn = &_praIn; + remote_arg** _ppraROut = &_praROut; + remote_arg* _praHIn = 0; + remote_arg** _ppraHIn = &_praHIn; + remote_arg* _praHROut = 0; + remote_arg** _ppraHROut = &_praHROut; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)>=1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)>=1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, (_pra + ((1 + 1) + (((0 + 0) + 0) + 0))) <= _praEnd); + _numIn[0] = (REMOTE_SCALARS_INBUFS(_sc) - 1); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 180); + _primIn = _pra[0].buf.pv; + _QAIC_ASSERT(_nErr, _pra[(_numIn[0] + 1)].buf.nLen >= 80); + _primROut = _pra[(_numIn[0] + 1)].buf.pv; + _numInH[0] = REMOTE_SCALARS_INHANDLES(_sc); + _numROut[0] = REMOTE_SCALARS_OUTBUFS(_sc); + _praIn = (_pra + 1); + _praROut = (_praIn + _numIn[0] + 1); + _praROutPost = _praROut; + _allocator_init(_al, 0, 0); + if(_praHIn == 0) + { + _praHIn = ((_praROut + _numROut[0]) + 1); + } + if(_praHROut == 0) + (_praHROut = _praHIn + _numInH[0] + 0); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[1]), (uint64_t*)&(((uint64_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[18]), (uint32_t*)&(((uint32_t*)_in0)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[20]), (char**)&(((uint64_t*)_in0)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[21]), (uint32_t*)&(((uint32_t*)_in0)[22])))); + _TRY(_nErr, 
_skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 88), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[1]), (uint64_t*)&(((uint64_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[18]), (uint32_t*)&(((uint32_t*)_in1)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[20]), (char**)&(((uint64_t*)_in1)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[21]), (uint32_t*)&(((uint32_t*)_in1)[22])))); + _TRY(_nErr, _skel_unpack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 176), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), (uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22])))); + _TRY(_nErr, _pfn(_h, (const dsptensor*)_in0, (const dsptensor*)_in1, (dsptensor*)_rout2)); + _TRY(_nErr, _skel_pack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), (uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22])))); + _QAIC_CATCH(_nErr) {} + _allocator_deinit(_al); + return _nErr; +} +static __inline int _skel_method_1(int (*_pfn)(remote_handle64), uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + remote_handle64 _in0[1] = {0}; + remote_arg* _praRHandleIn = _pra + REMOTE_SCALARS_INBUFS(_sc) + REMOTE_SCALARS_OUTBUFS(_sc); + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, (_pra + ((0 + 0) + (((1 + 0) + 0) + 0))) <= _praEnd); + _COPY(_in0, 0, &(_praRHandleIn[0].h64), 0, sizeof(remote_handle64)); + _TRY(_nErr, _pfn((remote_handle64)*_in0)); + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _compare_versions(char* stub_ver, char* skel_ver, int* result) { + unsigned long int major_stub = 0, minor_stub = 0, patch_stub = 0; + unsigned long int major_skel = 0, minor_skel = 0, patch_skel = 0; + char *saveptr1 = NULL; + char *token1 = NULL; + char *saveptr2 = NULL; + char *token2 = NULL; + int i=0; + for (i=0, token1 = strtok_r(stub_ver, ".", &saveptr1); i<3 && token1 != NULL; i++, token1 = strtok_r(NULL, ".", &saveptr1)) + { + unsigned long int tn = strtoul(token1, NULL,10); + if( tn > 999) + { + *result=-1; + return 0; + } + else + { + if(i==0) major_stub=tn; + if(i==1) minor_stub=tn; + if(i==2) patch_stub=tn; + } + } + for (i=0, token2 = strtok_r(skel_ver, ".", &saveptr2); i<3 && token2 != NULL; i++, token2 = strtok_r(NULL, ".", &saveptr2)) + { + unsigned long int tn = strtoul(token2, NULL,10); + if( tn > 999) + { + *result=-1; + return 0; + } + else + { + if(i==0) major_skel=tn; + if(i==1) minor_skel=tn; + if(i==2) patch_skel=tn; + } + } + if(major_stub=patch_stub)) + { + 
*result=1; + return 0; + } + } + *result=-1; + return 0; +} +static __inline int _stub_skel_version_check(char*_in0, int* resVal) { + int _nErr = 0; + char* p = strstr(_in0, "_idlver="); + if(!p) + { + *resVal = -1; + return 0; + } + p+=8; + int i=0,len=0, comVer=0,num_delimit=0, updtInxStub=0, updtInxSkel=0; + for(i=0;i2) + { + *resVal = -1; + return 0; + } + if ((p[i]>='0' && p[i]<='9') || (p[i]=='.')) + { + len++; + if(p[i]=='.') + { + num_delimit++; + } + } + else if(p[i]=='&') + { + break; + } + else + { + *resVal = -1; + return 0; + } + } + char* stubVer=(char*)MALLOC(len+1); + _QAIC_ASSERT(_nErr, stubVer!=NULL); + for(i=0;i='0' && p[i]<='9') || (p[i]=='.')) + { + stubVer[updtInxStub]=p[i]; + updtInxStub++; + } + else if(p[i]=='&') + { + break; + } + } + stubVer[len]='\0'; + char* skelVer=(char*)MALLOC(strlen(IDL_VERSION)+1); + _QAIC_ASSERT(_nErr, skelVer!=NULL); + for(i=0;i< strlen(IDL_VERSION);i++) + { + skelVer[updtInxSkel]=IDL_VERSION[i]; + updtInxSkel++; + } + skelVer[strlen(IDL_VERSION)]='\0'; + _TRY(_nErr, _compare_versions(stubVer, skelVer, &comVer)); + *resVal = 0; + if (comVer==-1) + { + *resVal = -1; + } + FREE(stubVer); + FREE(skelVer); + _QAIC_CATCH(_nErr) {} + return 0; +} +static __inline int _skel_method_2(int (*_pfn)(const char*, remote_handle64*), uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + char* _in0[1] = {0}; + uint32_t _in0Len[1] = {0}; + remote_handle64 _rout1[1] = {0}; + uint32_t* _primIn= 0; + remote_arg* _praRHandleROut = _pra + REMOTE_SCALARS_INBUFS(_sc) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) ; + remote_arg* _praIn = 0; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==2); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==1); + _QAIC_ASSERT(_nErr, (_pra + ((2 + 0) + (((0 + 1) + 0) + 0))) <= _praEnd); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 4); + _primIn = _pra[0].buf.pv; + _COPY(_in0Len, 0, _primIn, 0, 4); + _praIn = (_pra + 1); + _QAIC_ASSERT(_nErr, ((_praIn[0].buf.nLen / 1)) >= (size_t)(_in0Len[0])); + _in0[0] = _praIn[0].buf.pv; + _QAIC_ASSERT(_nErr, (_in0Len[0] > 0) && (_in0[0][(_in0Len[0] - 1)] == 0)); + int resVal; + _TRY(_nErr, _stub_skel_version_check(*_in0, &resVal)); + if(resVal==-1) + { + return AEE_ESTUBSKELVERMISMATCH; + } + _TRY(_nErr, _pfn((const char*)*_in0, (remote_handle64*)_rout1)); + _COPY(&(_praRHandleROut[0].h64), 0, _rout1, 0, sizeof(remote_handle64)); + _QAIC_CATCH(_nErr) {} + return _nErr; +} +__QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmlop_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { + switch(REMOTE_SCALARS_METHOD(_sc)){ + case 0: + return _skel_method_2(__QAIC_IMPL(ggmlop_open), _sc, _pra); + case 1: + return _skel_method_1(__QAIC_IMPL(ggmlop_close), _sc, _pra); + case 2: + return _skel_method(__QAIC_IMPL(ggmlop_add), _h, _sc, _pra); + case 3: + return _skel_method(__QAIC_IMPL(ggmlop_mulmat), _h, _sc, _pra); + } + return AEE_EUNSUPPORTED; +} + +/* Library version needs to be added in the name member of note_type structure in below format + * "lib.ver.1.0.0." 
+ "" + ":" + "" + */ +const lib_ver_note_t so_ver __attribute__ ((section (".note.lib.ver"))) + __attribute__ ((visibility ("default"))) = { + 100, + 0, + 0, + "lib.ver.1.0.0.libggmlop_skel.so:4.5.0", + }; diff --git a/ggml/src/ggml-qnn/kernels/libggmlop_skel.so b/ggml/src/ggml-qnn/kernels/libggmlop_skel.so index 9d4be24f3263907fa6c2bf83eacbf6fe17941dd9..8dcdba3c2fd271b8a8ec61bc77f7b443bf6a9c1c 100755 GIT binary patch delta 5270 zcmcIo4RBM}l|JuEmSsag7P9;=o^9C%#~2xmVZnHxo^6n+Nt6sjoZ<<#fy6b0sKEg@ zH9U{`31kJ!%Nw#xsawQOlk6s&)yA2W>4f$nD;YyGX-1|Vl8sWu(99%h*CZ4slS$(J zt|SxTrqi@LduOiB`OdlLzI*OH=jh!xy5sQn<03mGii-L)L|idE_3)vw#1PC8FZIK zqEhGyQ(sQ<-G$nC+B2(T#Y8$fS93m`GJ=CHZUGf0&w-JU%he3s98g zB={ibtqJ~R&KnZ^Sgj{|4vhCHS{EKPSO!{pq6V zLuG0%g@(_h9}^r2@z=kiM12;i(`PCC)5nibY5!xlD;^1Ixuu8Z8yh$;w6^s4rl#&^ zI`xx05Xl^+_8rR9CenvaJ9Ni!{U6DbCJm}4ojDWye)X%^qt0jKLG^QeiF2Xb<9af9=C)A81!pCV+mruC#v%tlU-B~1&Lehc8G<|;~pTN*eACvUcdm-;L zkDqz|nxgc7<%>T14_Iug6jp05M*}&~b%0+s9t|7v0)>|%(sOhgq18YDXpbvu120a9t z2l_gw2>Jo20aV8vEG`eL8JHsmYzK~Qw2d+Mh-!w6ZOpmx0hVG6MFVi-PQ~O7tJxTI zdLK4D?k_1e;Lp6dWBHPBgBujVujBi)NN!zbFNVcJwlUv6&6-!4aT?3#$JV4Z=iMo+ z(7vCyFuij-_AFAV1@dy&pBDk7BZU&RRa`n35Zs;H^TSbS{A?F?fwlbGu& zBIg2MIo@daVlNiu&AP!ZW#72DSIf*V7eZQ1eu3~yZGC=D-pTJ#`yFs33ws`MyxlM! zJ~Jk59EkX}z4;Z+1M&g&4osBMXqq%4yey;9K)!@9lg$0B($F8%jbqxo`K`>iSF0>o zK5v6K#t+%cpPAeUt-HWkvjT$8Uw4kthU^=5^lO#sd(~e~ZJKBrQs=)D4G5BaF!O$D zAG|QMmLXI_+PMO!Q#SrFJ_}~d?V$cE%|q&sMxyF5dB1vPWJo2c@!&@7%Yss2zgApW zzk)vyL*3q|A}+}iTp%4fwNQWJ)Im?i*Yrz6ZUtKow-)7W$R@7g1d?gn?$ zK;)!$q^ML`#B*`HrKg(b;hL77IbTd|;yjJE^l+Wf-qK@)43b+d3n{}xej!4hHFq7g zAzjKhL<40AcZ*EspS!8ASdt&BK-gcCa?ZB9somKY4U}S|upj*&=vE8uo0{wDSZ?iI zC=r>@%U1S)ZTAB3-I|(FP{X+ey6iM~YL8BAX1hOaimI1*&@{5^uYi9X{FZxIc(AKx zrRBA#dQyvKELBSSr|#!FXv9OTh+7M0X$vt6xr>73e8v&kDj-hSMe6haaB5Rm<}W^C zQm@aY=bSwg-f)cWK6CJuQP$2p8;+J?yaxq`ui&cNwwq&@Zgd`OvK=*q4cE(L!z7o{ zdAV%MIQ5lEs-ij}=(r*mKZd2{+*cHwxIld+(uug-H6a(jOFtpOZ4X|tPRwg+WBync zTNm~=9&7fHpWHb?gI=c^8(*9BVsX{Tm35%hoD&q9?Lii%KJJ{{Qm1aaSf_3Qb%Qpx z8T?N0Z)p#^~uII!LI`c zE^u4vSb50gZ}td&TC?@j0mvJ}Lji;I+GLf)+TEBVr?n{9`JFmdhJ6!kg;sm8k>k)% zAgsU>)f`VTfu1{lF7Phrxp&@#ck%{!_FH%c>IRK_2EH8p zEysWv@U&x%^Q zqIw0+tr_QT#~_Q;;iJylUerHY5tFPI1Y@Iw><_&hD8bnkdkua=_~k&U$KV&-mf(3S z>+xb|gb`bymD6TRkeY3Y^T*Ed7(ih64a!5R2XmBSv7(pxzY|t4p@;d`>9#;8Ng+2A=&wK*mg8+ zqw^DvUC7D0uwjzP_Ujy5LULINb|V*jpZ4XP`3mP;S6MNqxe~M7tE+2Sgk7Id*uUU^ zk@{k?(*ep0%ey~70O*hUY;Fo(^W~i7dPyfsw%*~3#C5&CGm^zR;$jQna!0OCeRhoL zmRXijFI}rQ8>4BTU;WkZH>d43eJGb5kBLhAihzmq?3vb>5O-=ZCIYUnVUy^=C z&MSWXZ%F#Ho%DAl{f|4XJmGKU`$uprk?+da3Ga`2zg@D!IG*gBe(Pla3%R}smEg~k z9T~ur9Vyvo)~QWzGDhn^6Wg!u(bn14FeL6%w(SUdZ z&=s0mdTL~BWqVqB z1nkqBdo+9Y%JkI=*Rl3J+DF+9WyZMGE+55duWtwFY1jMJUH90-yy(`W*q+ta+c&wwyZQXP ze0rSr`bhHmEX`5%Q@ll7!rR0vpeI53TLms0PphBg-XY-ShW>jfFjvp$nb7`Nv`TsC zp@(-o;@iAs>n>mQmX$Tj#mer^om+Qoqhw3!AMtJ58JFBXzP6@L+_|l0iC6~WS|YAq zExHnh$#D-qw&UT>UWjJeE$4PiMI~;yG{vxr7Lz?bM+p^C1FfPJluo%c?>1C*8*tC(1zRf`Q- zMVP>bo{r9MsWEM@m?Ly+C&X2m{EE!4#<&u{Bg$zn19`RL;>}7MxYM}o@@B`q^>%C^ z6Xa*1=W-*u7u{?$6Tdw3%QWwt=N)dx$ABXNoxwl^y#^h(HDmtd)i|Ly)9_UBy=-`= zGr+W}iUvpOu{xPysv&%#zpz90N|Y+0-}#Z{YSK5k7XnehzhVJ@AGk{vPm(V!Q+- z@ehE#!2Fa1Qs&SB2>9_%4>%6Yk8>KI0_I1TFqlID;65By!Gu!2Q7d6#4ikfs@~` zl(^so1o6*XNF_etUx5=}uM+qiaO&5s5nx?ROqhi;{|(odBr+(A=F&~z1;BUVfgIcL z<*6T-Hw*4z#lQn`yp{{HXfXtDPY=KjECZfye2cfw!2KzhoBfk^r22GTq9msl&Uj8% zZ_x%8R|vJ*vx|#a%`$Dx;#I>J7kdP)s^(7Zcun5$#+pJwlLVvYUlK$yTD^3BS`0f8 z(>j;_keRL8drKWA?mmhbt1Zdk>B@VTwy1W#iRVVFaQNO@SrD?d6U$23hZWi{mX))g mF3={Hl}o;b@D-(nhcj5>RfdP*1@sgz=-MKpXHx{zLHED+M+R8{ delta 6105 zcmcgwdr(`~oj&*Kfp`m9h=+NIr-9m7U|V(#p;uP|3!69!!@5dfV;;eF@k?V9f)nhk 
z^(%HNdwhKx_u-*MPMf-En^gu+*E^H67oiaBb#_uMpRYcdOUTLGSGR6v*sL_^G?SycmPUwC z`Tr~H(=&wNh zWbjJNd8RFe>u~+AEBO5)xqpDIZ9k{y+FkigL2uqWL9do4nt!p7-aL+fs$eYWo#oQ5 zKVcy)MrY8Q`)<%nIMSEO5u%Y0G#nH=9%Tc^+bcF-fIVG`O<5 z%h}bQJMit2MN!GiA*++|$iUuFn#-LRiv4O&tsT9qd-9KNI35c0dllRpA?3FI?uJ&j zrRts6z07ul&$`ttHvAiA!KU=JLs#CUqhvjF^rOHT=2UuAz%Rlda5M-=P}Z5a-YIJTbO4P5pXI_!+`#6zEk@0R0{^Xlz4 zLLuO*@X;lxw+nHe;Rdt#3Ig6l?-RD&2n{3PtH2LClRaDQNuGp2(EApAG23p2P9V{4 z*LV`qnMCVo$co)uJ@&fSeRRG#=@yo%5lc0WrGm6ThL;L@G4#cy0%n23OC?}>ikGex zoG&e1E8tQr0bQ$NUhVM$yT{BTT`~qs#^NZo-!gBZOV({+TXxxBs9lA49!rylcxw|j z^MqOSV*ngZkP13mD~H_ZcnLZ#xDT!PpL{gZ+Qz1)`my%gvVDcCuaw`kk;lP3LM@J^ z9)UDQ+sf%QCbmpFl8?YFT51cpHuB9{E%V}RA>$;LM(5%N3qe*7v*ErRM$-NMPE}$WP?un zsoo&u(C`^HkIE8`)i%oF7PgVS7akWLKP9kVr#>>t7uG;Bw)!qlAI2{qa_KF^H|t8f zo((%E6LhQ9bv*3rcCCO-!x7!Ei7g=((wdGrS(vXZ_#8Z&qLd4SEpn|gsQPH;XLon1 zderapMO)>(mQC(-Up~in7sv`Q5WMLU2e-a_PIp%X9WMLU2 z|NRMBSjNbI8$!PJ%x5QLfq+yO8ToHW$XDCQXCY)^86#6d_O$7ik+I*hoQ$7obu1Dw zhmn#A?OZkFQUYEenolRel4d$39m%DOHST?rV3X{hh2;7Ro>L!{bgYF}fr=&N6m_0=X$U+wnm ztN$IjYc&VoXOWAi^lb)@u0md_qrQv5Z_>ySsV-x4%mBh}Kh!Qgmr-OcqRsB8?~+aV zt53pCsTyR}3%2W}G)(=eb@g2tGP(mB!B1H45os#J%r+g7GBaz6Y~ifKE-oy)T}>Q$ ze8A)_Z-3Mf(4zCzlbr6VbTqTMP8p3P(MZC%To>>{eF!hnO^{~#G7U*zuyf>`OC(zG zONF1oHkUT*PaoX8FKv%C2=vE?pxY!@Spzp=xcaW!_vasDWqdpF?y+gh6v-X+{gt4cm$RdA{g z2=|>v*m^bZT=^k=_^crG(NC8zqC=~>fVlp4&`Xz?gT4WuFS{T&;-dbU62`&FF_h1M<5lD1IT5=_JA0RHe_G}@Z=8t_!MF^u(3PR=0?yp zrJp`4uu4otfR;{j^zA|qeBY5io{3K&8k~(Wl3uAX-W)gc^nCNAUq3$iv~)4PEY>ai z@fqMAtCQY{{{pjF`Y^sU=FVQiog7;&h&(CnOvp&;LhHnlGRID&&OQ0GbULA!k)eCSXgld%10xGBRBf3`TTM5 zHK`)0Fw3v}eR?bDo38B_zlHt2oj-w(ntqXR)>BQ=k)#5~D)lE-ug7t;I4+KvIIXXQ zJAJuSe(v(AdQIs5o!_)fI?1?tD%3_|y+k+R^5HL@kBeGG){`2NNNS|ctg5&J?c_iw*@HevFJ~1ChYxkE-MxGN zzJsovU)k>}@7P>fX)N2*cHqfl=Z9MA@E3qy+m7ck4Qf~a7>4nMUkQSGQ2rH0TO9$IIk zmEPviVnm&UjmB!w9{$3G!G9Kj+Gz=6B^jL0Yh<(cboRFGK6qefW9Pokw!@uUc6J{; zxUXl^rky<-H>@3ew9xx0A<`d$C{C9Ae{(3KAEVbnapK8{K#Gv9AK3S;Q&ir@$AYm|<6t z3E+jI6`)NnH30u%3DyD6J%IoF0j!{N9W69I?g6~w0labv7H9&!&Rno*u?N;YfNO!T zA+8TK1#{2&~^uS)6d?JnSwDAocW298|YMs9b{0Z$qTsV4Y3rwQML zAq!+dHwncr0WZA{eib-;QLK*me?}X?{LBY2DzDO)Hs;j6vcUrF5&p z2MRq}nX8(>6ii6o%4gXLy_B;qM@{De)XT5SkEQ39)*aIRb-8MKNZh%D7uWF&^Ro0G z>kHUpCDKRhi`n(1QvQZwPE&?`2_mHX94eulNDs{o)Fs0q-zX>K2TKeKyYK%1@yBem From 63a3fd2fe991eb6b4cddcfd069d48c9c8ae333f7 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 24 Mar 2025 12:48:25 +0800 Subject: [PATCH 130/200] ggml-qnn: refine the entire ggml-qnn.cpp to make code more clear --- ggml/src/ggml-qnn/ggml-qnn.cpp | 208 ++++++++++--------- ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c | 8 +- ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h | 8 +- ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c | 12 +- ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c | 8 +- ggml/src/ggml-qnn/kernels/libggmlop_skel.so | Bin 13704 -> 13736 bytes 6 files changed, 123 insertions(+), 121 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 70bdc625fe37b..3357251f290e0 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -5,15 +5,20 @@ * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools * - * this single-source-file or self-contained implementation of ggml-qnn backend has 10 sections: + * there are three tech approaches to implement the ggml-hexagon backend for Qualcomm's Hexagon NPU: + * - general approach through Qualcomm QNN SDK:offload ggml op to QNN, then QNN will transfer to Hexagon cDSP + * - general approach through Qualcomm Hexagon SDK:offload ggml 
op to Hexagon cDSP directly + * - special approach through Qualcomm QNN SDK:mapping the entire ggml cgraph to a single QNN graph + * + * this single-source-file or self-contained implementation of ggml-hexagon backend has 10 sections: * section-1 forward/prototype declaration, global vars, macros, data structures * section-2 ggml-qnn internal troubleshooting function/class * section-3 helper function for WoA(Windows on ARM) * section-4 general helper function * section-5 QNN helper function * section-6 Hexagon DSP helper function - * section-7 ggml-qnn backend helper function / class - * section-8 implementation of ggml-qnn backend according to ggml's backend subsystem + * section-7 backend helper function / class + * section-8 implementation of ggml-hexagon backend according to specification in ggml backend subsystem * section-9 implementation of general approach through QNN and Hexagon DSP * section-10 implementation of special approach through QNN:mapping the entire ggml cgraph to a single QNN graph * @@ -131,6 +136,8 @@ class qnn_instance; struct qnn_parameter; struct ggml_backend_qnn_context; +typedef int (*pfn_mallopt)(int, int); +typedef int (*pfn_android_mallopt)(int, void *, size_t); typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); typedef int (* notify_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); typedef int (* ggmlhexagon_op_func_t)(remote_handle64 handle, const dsptensor * src0, const dsptensor * src1, dsptensor * dst); @@ -276,11 +283,11 @@ using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); -//QNN resource management for the first technical approach(general approach in ggml-sycl or ggml-cann) +//QNN resource management for the general approach through QNN(similar to ggml-sycl or ggml-cann) using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>; using qnn_singlenode_res_t = std::tuple; -//QNN resource management for the second technical approach(mapping the entire cgraph to a single QNN graph) +//QNN resource management for the special approach through QNN(mapping the entire cgraph to a single QNN graph) using qnn_tensors_t = std::vector< Qnn_Tensor_t >; using qnn_tensor_pair_t = std::tuple< ggml_tensor *, Qnn_Tensor_t *>; using qnn_tensor_pairs_t = std::vector< qnn_tensor_pair_t >; @@ -360,17 +367,19 @@ struct ggml_backend_qnn_context { QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; struct qcom_socinfo socinfo; - //QNN resource management for the first technical approach(general approach in ggml-sycl or ggml-cann) + //QNN resource management for the general approach through QNN(similar to ggml-sycl or ggml-cann) std::map qnn_singlenode_graph_map; - //QNN resource management for the second technical approach(mapping the entire cgraph to a single QNN graph) + //QNN resource management for the special approach through QNN(mapping the entire cgraph to a single QNN graph) std::map qnn_multinode_graph_map; + //quantize data -> fp32 std::unique_ptr work_data; std::vector> tasks; size_t work_size; size_t desired_size; int n_threads; + //hexagon resource management for the general approach through Hexagaon cDSP size_t rpc_mempool_len; void * rpc_mempool; remote_handle64 ggmlop_handle; @@ -381,7 +390,6 @@ struct qnn_op_caps { ggml_op op; const char * qnn_op_name; const size_t input_param_count; - const char * qnn_param_name; }; 
struct qnn_parameter { @@ -562,7 +570,7 @@ static domain hexagon_supported_domains[] = { }; static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { - {true, GGML_OP_NONE, nullptr, 0, nullptr}, + {true, GGML_OP_NONE, nullptr, 0}, {false, GGML_OP_DUP}, {true, GGML_OP_ADD, QNN_OP_ELEMENT_WISE_ADD, 2}, {false, GGML_OP_ADD1}, @@ -985,12 +993,10 @@ static void ggmlqnn_get_timestring(char * p_currenttime) { } //fix some tricky memory issue -typedef int (*pfn_mallopt)(int, int); -typedef int (*pfn_android_mallopt)(int, void *, size_t); static void ggmlqnn_disable_android_tags(int disable) { if (0 == disable) return; - +#if defined(__ANDROID__) void * lib_handle = dlopen("libc.so", RTLD_LAZY); if (nullptr != lib_handle) { int api_level = android_get_device_api_level(); @@ -1013,6 +1019,7 @@ static void ggmlqnn_disable_android_tags(int disable) { } dlclose(lib_handle); } +#endif } // ================================================================================================= @@ -1338,32 +1345,6 @@ static const char * ggmlqnn_get_qnnerror_string(Qnn_ErrorHandle_t qnn_error_code } } -static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, - Qnn_Param_t * params, uint32_t num_params, - Qnn_Tensor_t * inputs, uint32_t num_inputs, - Qnn_Tensor_t * outputs, uint32_t num_outputs) { - - char opcfg_name[GGML_MAX_NAME] = {}; - - //ensure the opcfg name is unique - if (nullptr == name) { - snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_idx(QNN_OPCFG_INDEX)); - } else { - snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_idx(QNN_OPCFG_INDEX)); - } - GGMLQNN_LOG_DEBUG("create qnn opconfig %s", opcfg_name); - ggmlqnn_inc_idx(QNN_OPCFG_INDEX); - - Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, - num_params, params, - num_inputs, inputs, - num_outputs, outputs - }; - Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; - - return opcfg; -} - // ================================================================================================= // section-6: Hexagon DSP helper function // ================================================================================================= @@ -2050,7 +2031,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); snprintf(ggmlop_domain_uri, ggmlop_domain_uri_len, "%s%s", ggmlop_URI, uri); GGMLQNN_LOG_INFO("ggmlop domain uri:%s\n", ggmlop_domain_uri); - hexagon_error = ggmlop_open(ggmlop_domain_uri, &ctx->ggmlop_handle); + hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); if (AEE_SUCCESS == hexagon_error) { GGMLQNN_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); GGMLQNN_LOG_INFO("only support GGML_OP_ADD on cDSP currently\n"); @@ -2082,7 +2063,7 @@ static void ggmlhexagon_close_cdsp(ggml_backend_qnn_context * ctx) { int hexagon_error = AEE_SUCCESS; GGMLQNN_LOG_DEBUG("enter %s", __func__); if (-1 != ctx->ggmlop_handle) { - hexagon_error = ggmlop_close(ctx->ggmlop_handle); + hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); if (AEE_SUCCESS != hexagon_error) { GGMLQNN_LOG_WARN("error 0x%x: failed to close ggmlop handle", hexagon_error); } else { @@ -2109,18 +2090,18 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens void * wdata = nullptr; ggml_tensor * src0 = op->src[0]; - //TODO: src1 might-be nullptr + //src1 might-be nullptr for some ggml op ggml_tensor * src1 = op->src[1]; ggml_tensor * dst = op; 
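/*
 * Editor's sketch (not part of this patch): the dispatch below copies ggml tensor
 * metadata into dsptensor structs field by field before the FastRPC call. Assuming
 * only the fields actually used in this function (ne[], type, data, data_len), the
 * marshalling step could be factored into a small helper like this; the helper name
 * is hypothetical and shown only to illustrate the packing that follows.
 */
static void ggmlhexagon_pack_dsptensor(dsptensor * t, const ggml_tensor * src,
                                       void * data, size_t data_len) {
    for (int i = 0; i < GGML_MAX_DIMS; i++) {
        t->ne[i] = src->ne[i];   // element count per dimension
    }
    t->type     = src->type;     // ggml data type, e.g. GGML_TYPE_F32
    t->data     = data;          // src->data, or the dequantized work buffer for quantized mulmat
    t->data_len = data_len;      // ggml_nbytes(src), or ctx->desired_size for the work buffer
}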
ggml_type src0_type = src0->type; switch (op->op) { case GGML_OP_ADD: - op_func = ggmlop_add; + op_func = ggmlop_dsp_add; break; case GGML_OP_MUL_MAT: { wdata = ggmlqnn_type_trait(ctx, op); - op_func = ggmlop_mulmat; + op_func = ggmlop_dsp_mulmat; break; } default: @@ -2128,11 +2109,11 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens } if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) { - dsptensor_0.data = wdata; - dsptensor_0.data_len = ctx->desired_size; + dsptensor_0.data = wdata; + dsptensor_0.data_len = ctx->desired_size; } else { - dsptensor_0.data = src0->data; - dsptensor_0.data_len= ggml_nbytes(src0); + dsptensor_0.data = src0->data; + dsptensor_0.data_len = ggml_nbytes(src0); } dsptensor_1.data = src1->data; @@ -2183,7 +2164,7 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens } // ================================================================================================= -// section-7:ggml-qnn backend helper function / class +// section-7: backend helper function / class // ================================================================================================= static const char * ggmlqnn_get_socmodel_desc(uint32_t soc_model) { switch (soc_model) { @@ -3510,7 +3491,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } - auto qnnstatus = QNN_SUCCESS; + Qnn_ErrorHandle_t qnnstatus = QNN_SUCCESS; if (_device_id == QNN_BACKEND_NPU) { //TODO: remove duplicated code between here and function htp_print_info const QnnDevice_PlatformInfo_t * p_info = nullptr; @@ -3587,7 +3568,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } #if defined(__ANDROID__) || defined(__linux__) - //_rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); std::filesystem::path full_path(std::string(g_qnn_params.qnn_runtimelib_path) + "libcdsprpc.so"); full_path /= std::filesystem::path("libcdsprpc.so").filename(); _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); @@ -4182,51 +4162,30 @@ static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_t return qnn_rpcbuffer; } -static void ggmlqnn_load_cfg() { - //this function can be called in various scenarios - static bool initialized = false; - if (initialized) { - GGMLQNN_LOG_INFO("qnn cfg file already loadded\n"); - return; - } - char time_string[GGML_QNN_TMPBUF_LEN]; - memset(time_string, 0, GGML_QNN_TMPBUF_LEN); - ggmlqnn_get_timestring(time_string); - GGMLQNN_LOG_DEBUG("program running start time:%s", time_string); - ggmlqnn_disable_android_tags(1); +static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t num_outputs) { - std::string cfg_filename = std::string(g_qnn_params.qnn_runtimelib_path) + std::string(g_qnn_params.qnn_cfgfilename); - GGMLQNN_LOG_INFO("load ggml-qnn config from %s", cfg_filename.c_str()); - qnn_cfg qnncfg_instance; - qnncfg_instance.load(cfg_filename); - qnncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { - std::ostringstream tmposs; - tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]" << std::endl; - GGMLQNN_LOG_INFO("%s", tmposs.str().c_str()); - }); - std::string precision_mode; - qnncfg_instance.get_intvalue("general", 
"print_qnn_internal_log", g_qnn_params.print_qnn_internal_log, 0); - qnncfg_instance.get_intvalue("general", "enable_perf", g_qnn_params.enable_perf, 0); - qnncfg_instance.get_intvalue("general", "print_tensors_info", g_qnn_params.print_tensors_info, 0); - qnncfg_instance.get_intvalue("general", "dump_op_info", g_qnn_params.dump_op_info, 0); - qnncfg_instance.get_intvalue("general", "inference_approach", g_qnn_params.inference_approach, 0); - qnncfg_instance.get_intvalue("general", "qnn_backend", g_qnn_params.qnn_backend, 2); - qnncfg_instance.get_intvalue("npu", "hvx_threads", g_qnn_params.hvx_threads, 4); - qnncfg_instance.get_intvalue("npu", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8); - qnncfg_instance.get_intvalue("npu", "enable_dlbc", g_qnn_params.enable_dlbc, 0); - qnncfg_instance.get_stringvalue("npu", "precision_mode", precision_mode, "fp32"); - GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_qnn_params.print_qnn_internal_log); - GGMLQNN_LOG_INFO("inference_approach=%d(%s)", g_qnn_params.inference_approach, - ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach)); - GGMLQNN_LOG_INFO("qnn_backend=%d", g_qnn_params.qnn_backend); - GGMLQNN_LOG_INFO("npu inference precision mode=%s", precision_mode.c_str()); - GGMLQNN_LOG_INFO("qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); - if (precision_mode.find("fp16") != std::string::npos) { - g_qnn_params.precision_mode = 1; + char opcfg_name[GGML_MAX_NAME] = {}; + + //ensure the opcfg name is unique + if (nullptr == name) { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_idx(QNN_OPCFG_INDEX)); } else { - g_qnn_params.precision_mode = 0; + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_idx(QNN_OPCFG_INDEX)); } - initialized = true; + GGMLQNN_LOG_DEBUG("create qnn opconfig %s", opcfg_name); + ggmlqnn_inc_idx(QNN_OPCFG_INDEX); + + Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, + num_params, params, + num_inputs, inputs, + num_outputs, outputs + }; + Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; + + return opcfg; } static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, @@ -4349,8 +4308,55 @@ static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn return p_qnn_tensor; } +static void ggmlqnn_load_cfg() { + //this function can be called in various scenarios + static bool initialized = false; + if (initialized) { + GGMLQNN_LOG_INFO("qnn cfg file already loadded\n"); + return; + } + char time_string[GGML_QNN_TMPBUF_LEN]; + memset(time_string, 0, GGML_QNN_TMPBUF_LEN); + ggmlqnn_get_timestring(time_string); + GGMLQNN_LOG_DEBUG("program running start time:%s", time_string); + ggmlqnn_disable_android_tags(1); + + std::string cfg_filename = std::string(g_qnn_params.qnn_runtimelib_path) + std::string(g_qnn_params.qnn_cfgfilename); + GGMLQNN_LOG_INFO("load ggml-qnn config from %s", cfg_filename.c_str()); + qnn_cfg qnncfg_instance; + qnncfg_instance.load(cfg_filename); + qnncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]" << std::endl; + GGMLQNN_LOG_INFO("%s", tmposs.str().c_str()); + }); + std::string precision_mode; + qnncfg_instance.get_intvalue("general", "print_qnn_internal_log", g_qnn_params.print_qnn_internal_log, 0); + qnncfg_instance.get_intvalue("general", "enable_perf", 
g_qnn_params.enable_perf, 0); + qnncfg_instance.get_intvalue("general", "print_tensors_info", g_qnn_params.print_tensors_info, 0); + qnncfg_instance.get_intvalue("general", "dump_op_info", g_qnn_params.dump_op_info, 0); + qnncfg_instance.get_intvalue("general", "inference_approach", g_qnn_params.inference_approach, 0); + qnncfg_instance.get_intvalue("general", "qnn_backend", g_qnn_params.qnn_backend, 2); + qnncfg_instance.get_intvalue("npu", "hvx_threads", g_qnn_params.hvx_threads, 4); + qnncfg_instance.get_intvalue("npu", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8); + qnncfg_instance.get_intvalue("npu", "enable_dlbc", g_qnn_params.enable_dlbc, 0); + qnncfg_instance.get_stringvalue("npu", "precision_mode", precision_mode, "fp32"); + GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_qnn_params.print_qnn_internal_log); + GGMLQNN_LOG_INFO("inference_approach=%d(%s)", g_qnn_params.inference_approach, + ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach)); + GGMLQNN_LOG_INFO("qnn_backend=%d", g_qnn_params.qnn_backend); + GGMLQNN_LOG_INFO("npu inference precision mode=%s", precision_mode.c_str()); + GGMLQNN_LOG_INFO("qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); + if (precision_mode.find("fp16") != std::string::npos) { + g_qnn_params.precision_mode = 1; + } else { + g_qnn_params.precision_mode = 0; + } + initialized = true; +} + // ================================================================================================= -// section-8: implementation of ggml-qnn backend +// section-8: implementation of ggml-hexagon backend according to ggml backend subsystem // ================================================================================================= static bool ggmlqnn_same_types(const ggml_backend_qnn_context * ctx, const ggml_tensor * op_tensor) { GGML_UNUSED(ctx); @@ -4375,7 +4381,6 @@ static bool ggmlqnn_same_types(const ggml_backend_qnn_context * ctx, const ggml_ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { struct ggml_tensor * src0 = op_tensor->src[0]; struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = op_tensor->src[0]->ne[0]; uint32_t src0_rank = ggml_n_dims(src0); uint32_t src1_rank = 0; @@ -4387,14 +4392,11 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons if (op_tensor->op != GGML_OP_ADD) return false; - //ggmlqnn_dump_op_info(op_tensor); + ggmlqnn_dump_op_info(op_tensor); if (!ggml_are_same_shape(src0, src1)) { return false; } - if (ne00 < 32) - return false; - return ggmlqnn_same_types(ctx, op_tensor); } @@ -5057,7 +5059,7 @@ struct ggml_backend_qnn_reg_context { static const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { GGML_UNUSED(reg); - return "ggml-qnn"; + return "ggml-hexagon"; } static size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { @@ -5264,7 +5266,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { GGML_BACKEND_DL_IMPL(ggml_backend_qnn_reg) // ================================================================================================= -// section-9: general approach: offload GGML op to QNN backend or offload GGML op to Hexagon DSP directly +// section-9: general approach: offload GGML op to QNN backend or offload GGML op to Hexagon cDSP directly // ================================================================================================= static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) 
{ /* @@ -5296,8 +5298,8 @@ static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const } /* - * provide a general skeleton to offload ggml op to QNN backend or Hexagon cDSP: peform element-wise operation on 1/2 - * input tensors and 1 output tensors + * provide a general skeleton to offload ggml op to QNN backend or Hexagon cDSP: perform element-wise + * operation on 1/2 input tensors and 1 output tensors */ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; @@ -5675,7 +5677,7 @@ static void ggmlqnn_compute_mul_mat_4d(ggml_backend_qnn_context * ctx, ggml_tens operation when offloading mulmat to QNN backend. this implementation will handle transpose in func ggmlqnn_compute_create_general_tensor() - * @param ctx the context of ggml-qnn backend + * @param ctx the context of backend * @param op the destination tensor where the result of the matrix multiplication will be stored. * * @note the logic of ggmlqnn_compute_mul_mat is similar to ggmlqnn_compute_op_two_tensors but much more complicated diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c index 1e1ce6488d25e..6f2c37e4087cc 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c @@ -288,10 +288,10 @@ __QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {4,&(methodArrays[ #ifdef __cplusplus extern "C" { #endif -__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_open)(const char* uri, remote_handle64* h) __QAIC_STUB_ATTRIBUTE { +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_STUB_ATTRIBUTE { return __QAIC_REMOTE(remote_handle64_open)(uri, h); } -__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { return __QAIC_REMOTE(remote_handle64_close)(h); } static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { @@ -419,11 +419,11 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_ _allocator_deinit(_al); return _nErr; } -__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_add)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_add)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { uint32_t _mid = 2; return _stub_method(_handle, _mid, (uint64_t*)src0, (uint64_t*)src1, (uint64_t*)dst); } -__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_mulmat)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mulmat)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { uint32_t _mid = 3; return _stub_method(_handle, _mid, (uint64_t*)src0, (uint64_t*)src1, (uint64_t*)dst); } diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h index 
0301f8f78f8d2..1273cb76b1797 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h +++ b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h @@ -268,7 +268,7 @@ struct dsptensor { * @param h, resulting handle * @retval, 0 on success */ -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_open)(const char* uri, remote_handle64* h) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_HEADER_ATTRIBUTE; /** * Closes a handle. If this is the last handle to close, the session * is closed as well, releasing all the allocated resources. @@ -276,9 +276,9 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_open)(const char* uri, remote_hand * @param h, the handle to close * @retval, 0 on success, should always succeed */ -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_add)(remote_handle64 _h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_mulmat)(remote_handle64 _h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) __QAIC_HEADER_ATTRIBUTE; #ifndef ggmlop_URI #define ggmlop_URI "file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" #endif /*ggmlop_URI*/ diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c index bddafa29ea81e..b5cfd8810cfe6 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c @@ -9,7 +9,7 @@ #define ggml_tensor dsptensor -int ggmlop_open(const char*uri, remote_handle64* handle) { +int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { void *tptr = NULL; FARF(HIGH, "uri %s", uri); tptr = (void *)malloc(1); @@ -18,7 +18,7 @@ int ggmlop_open(const char*uri, remote_handle64* handle) { return 0; } -int ggmlop_close(remote_handle64 handle) { +int ggmlop_dsp_close(remote_handle64 handle) { if (handle) free((void*)handle); return 0; @@ -279,9 +279,9 @@ static void ggml_compute_forward_add_f32( } } -int ggmlop_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) +int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - FARF(HIGH, "=============== DSP: ggmlop_add "); + FARF(HIGH, "=============== DSP: ggmlop_dsp_add "); switch (src0->type) { case GGML_TYPE_F32: { @@ -349,8 +349,8 @@ int ggmlop_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * } -int ggmlop_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - FARF(HIGH, "=============== DSP: ggmlop_mulmat "); +int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + FARF(HIGH, "=============== DSP: ggmlop_dsp_mulmat "); GGML_TENSOR_BINARY_OP_LOCALS diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c index 33d47174bf5ef..9d6b64fd6b570 100644 --- 
a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c @@ -573,13 +573,13 @@ static __inline int _skel_method_2(int (*_pfn)(const char*, remote_handle64*), u __QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmlop_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { switch(REMOTE_SCALARS_METHOD(_sc)){ case 0: - return _skel_method_2(__QAIC_IMPL(ggmlop_open), _sc, _pra); + return _skel_method_2(__QAIC_IMPL(ggmlop_dsp_open), _sc, _pra); case 1: - return _skel_method_1(__QAIC_IMPL(ggmlop_close), _sc, _pra); + return _skel_method_1(__QAIC_IMPL(ggmlop_dsp_close), _sc, _pra); case 2: - return _skel_method(__QAIC_IMPL(ggmlop_add), _h, _sc, _pra); + return _skel_method(__QAIC_IMPL(ggmlop_dsp_add), _h, _sc, _pra); case 3: - return _skel_method(__QAIC_IMPL(ggmlop_mulmat), _h, _sc, _pra); + return _skel_method(__QAIC_IMPL(ggmlop_dsp_mulmat), _h, _sc, _pra); } return AEE_EUNSUPPORTED; } diff --git a/ggml/src/ggml-qnn/kernels/libggmlop_skel.so b/ggml/src/ggml-qnn/kernels/libggmlop_skel.so index 8dcdba3c2fd271b8a8ec61bc77f7b443bf6a9c1c..8695cbf36949a45dfd7ce3d7cec38ad6f8fea6c4 100755 GIT binary patch delta 2409 zcmb7_e@t6d6vxjk{XzNJ@8lzGYNtS3;f>W7cY>Nz|ng3w?$AC^_LL7ciUmtsU|9O)0&i$Nw z-+TARx#x@=nmRP9i1=8k*J>^%G8cdHWVwj8r4enn66KJcb`sGH^f=T>f|_9W*ztfk z(BG;y6VDtxLk1oJE6MTvy>ak3cpEJ9hp;{m(bKR_%BJs#qG+nZA6(J^mic$6f)~uN z{6fAUYR3l~6a9AD0oH`aSLoOQZqPCJ?*<3mM31ALLuu43!bHOuIfY%|8Fpa6EPik< zF~Cm!;0YU1B*8X1YWUzA27X(|JOL@Mo9cFSclSp6`oo=r{o&ptk=~BiXs|DQwChz0 z!`Bx+&~dmk(j@~aFWPy?{z%_om;9{8eY(~nucWkO>0a;a?~*2Slit+Cp z9ae!|A~Hc!!v(t zx;$cOb}2Z;n+lN^9v|Av$@ZwcW!Vslyl%}CnR3-y?c$k0W^izx6tz9eZJVsJ71XAx zo;mg;XLFU?Ce^>dZJsq`k=q$#>qQ<@{K5pS{4{A=zA#xBwD+8eZf2kOiiQf8%m4fh zZJL)CZO@9h{M%L@Js89~7F&9@#ZQei%(TTAdY5scl-vzf&nX?5)s zPst|NU}>nOCtl;JIQjbKOBAPQQILk*&|sV#8F(fC(b$qJEX3E+?|vnf0}a7+Sia}3 z$$78Yse{l zq~lNyq!Ib9@|c~2h=XS|c3b&Cm=pwHQJn)J@qblrcuHe`d5SEi&Ow>!+s>xir?A=U ztAvRS`5MK(DV`Q2B{4sB#wzoi)ZePChNY6sUMa{0EW9$7l*HA-Dz-RX?I@NHl%I;S z+XaW9bee{p0`o>OUxS?m^SxpI0rn1_58xdtlxso)GuvVAFZa_9a;XkAgXe zalcO>i7vyzGiFCM2CRX3_n8w(5D8zE3~EI`>G);v$BH)LEIJ6j1AY#}1kQZ`+)%2f z5#~3+ZQ!UdD#X+BfN4D5WbKub=3sWwwfHmP= zT{ZCCpw7YDx?x}wZ$(nL-)>-s9H=OXmKwZ1u;y~m`sOYJw}SQ6G6d!uq1B2Pz*;IyVssg-FV|PV`r>s9tgmiAg5!!>ew{eDU%~on znuX--^JwLGe>PRXaU2d7hsi;^z{6YYq-OAKjjCzGdJuI_B3d$zLjO@D-h| z^70kEQFT`MwM<01GM6ZUdIyT!eT-K5({7SDfx`wYZ_kog=GQ3Q@!-Ek2*6d>9I?V&7MgieY4 z!Oavv3;%=RL%t1K@;4|Lx=8#n1G~Z129|h4Me%|nq8)&2N~fJjXhh~&e5K5=5j^Z9 z>Pp6)bjb9A9upriu>9d0e8uwkmVw^rtF13nTU&QmZ(sDKuHJ!Gibh-DZg1*o>1s_y zx{q{qH@(7P>kI3fT3S-x-o94Otnl-v);hM^YPsIl=aI>$!~(;`S8Vy-99RXqTmIdn z+gYsaIVyd8rA*n_Z_lsD)B`DH*A09rWniqw$i+y*$irC2Z`pTw|H49D^AQE%ks4Xh z?RNg%eqRyabmXcmUUh8s>eD-itqE8qo9KerY5cS^|A1Z*m&Kz`aZYTDmifKd@+pvc zv1K;8SQ2L@Y4Nuy%fgwdf{3ewV-K=MzoEf`h49_muxXZOoqN?hKav)XbwuzbiMo#B z#PM-ToNhQQylCQUCjQaHb0(gB$nEmJUrqeW8t-aICd{unH2%)SmmhMQyzqvJ7uew{ zah$}*sjuT<*Hcb47eC~lu{lbRtzL|m2XFLNt7rM3cfh}=t|L(%*!0e85B5`n zVucYJEW(H+$en>l@(#xT^hTAs#0^CU(okGI1mW{V;`fR&Y$sHRBg$5B0tGEK zeF{Ia4CTBEr&=>;RILtO4WIO-dE;X!Zjg#dKyD2Ymkrmw!5T?q8va{^fNdmQhPT{yj*W^$klV##DAMu$-8kPqy=p3rv2au;s;6i3c8b%T=(k+veh`9ztBiZQ1K4 zKODnr4i9YwUr#Ysqzar8S;tNj4;fe3~`?m${gXm&|d=i#iJWKo)EZL7?zP&Bz&@LCw zt_;|do5D4fLA$J$kDDrkE;+$6n)z5|sY8y-QgM8yvO}pa_|dAMdY_-EDpP%YxT>t+ O&Ss*I4WE{(u73gdVuRTL From 2011a4e5090d362e186f4a4fb3f7f49ea9d47c19 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 24 Mar 2025 21:52:56 +0800 Subject: [PATCH 131/200] ggml-qnn: refine the entire ggml-qnn.cpp to make code more clear --- ggml/src/ggml-qnn/ggml-qnn.cpp | 177 ++++--- 
ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c | 536 +++++++++++++------- ggml/src/ggml-qnn/kernels/libggmlop_skel.so | Bin 13736 -> 13672 bytes 3 files changed, 460 insertions(+), 253 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 3357251f290e0..847191b7dbc0c 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -367,7 +367,7 @@ struct ggml_backend_qnn_context { QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; struct qcom_socinfo socinfo; - //QNN resource management for the general approach through QNN(similar to ggml-sycl or ggml-cann) + //QNN resource management for the general approach through QNN(similar to ggml-sycl or ggml-opencl) std::map qnn_singlenode_graph_map; //QNN resource management for the special approach through QNN(mapping the entire cgraph to a single QNN graph) std::map qnn_multinode_graph_map; @@ -379,7 +379,7 @@ struct ggml_backend_qnn_context { size_t desired_size; int n_threads; - //hexagon resource management for the general approach through Hexagaon cDSP + //hexagon resource management for the general approach through Hexagaon cDSP(similar to ggml-sycl or ggml-opencl) size_t rpc_mempool_len; void * rpc_mempool; remote_handle64 ggmlop_handle; @@ -719,10 +719,14 @@ static void ggmlqnn_print_tensors_info(const char * func_name, const ggml_backen if (nullptr != func_name && nullptr != ctx) { GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); } - GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", - src0->name, - src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], - src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + if (nullptr != src0) { + GGMLQNN_LOG_DEBUG( + "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + src0->name, + src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], + src0->ne[3], + src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + } if (nullptr != src1) { GGMLQNN_LOG_DEBUG( "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", @@ -874,7 +878,7 @@ static const char * dlerror(void) { // ================================================================================================= // section-4: general helper function // ================================================================================================= -//ensure every QNN tensor/opcfg name is unique +//ensure every QNN tensor/opcfg name is unique, threadsafe is not required at the moment static void ggmlqnn_reset_idx() { g_qnntensor_idx = 0; g_qnnopcfg_idx = 0; @@ -1472,7 +1476,7 @@ static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_doma } } *domains_info = req.sys.domains; - *num_domains = req.sys.num_domains; + *num_domains = req.sys.num_domains; } else { hexagon_err = AEE_EUNSUPPORTED; goto bail; @@ -1498,9 +1502,9 @@ static int ggmlhexagon_get_dsp_support(int * domain) { } if (0 == dsp_capability_domain.capability) { - dsp_capability_domain.domain = HEXAGON_ADSP; + dsp_capability_domain.domain = HEXAGON_ADSP; dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT; - dsp_capability_domain.capability = 0; + dsp_capability_domain.capability = 0; hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); 
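/*
 * Editor's sketch (not part of this patch): ggmlhexagon_get_dsp_support() and the
 * VTCM/HMX/HVX/arch-version helpers nearby all repeat the same
 * remote_handle_control(DSPRPC_GET_DSP_INFO, ...) sequence. A shared query helper,
 * sketched here with a hypothetical name, would capture that pattern; the attribute
 * IDs mentioned (DOMAIN_SUPPORT, ARCH_VER, ASYNC_FASTRPC_SUPPORT) are the ones
 * already used in this file.
 */
static int ggmlhexagon_query_dsp_capability(uint32_t domain, uint32_t attribute_id, uint32_t * value) {
    struct remote_dsp_capability cap;
    cap.domain       = domain;         // e.g. HEXAGON_CDSP or HEXAGON_ADSP
    cap.attribute_ID = attribute_id;   // e.g. DOMAIN_SUPPORT, ARCH_VER, ASYNC_FASTRPC_SUPPORT
    cap.capability   = 0;
    int err = remote_handle_control(DSPRPC_GET_DSP_INFO, &cap, sizeof(cap));
    if ((err & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) {
        // FastRPC Capability API is not available on this device
        return err;
    }
    if (AEE_SUCCESS == err) {
        *value = cap.capability;
    }
    return err;
}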
if(dsp_capability_domain.capability) { *domain = HEXAGON_ADSP; @@ -1538,9 +1542,9 @@ static int ggmlhexagon_get_vtcm_info(int domain, uint32_t * capability, uint32_t * since the ADSP does not have a dedicated VTCM, we expect the output to be 0 */ struct remote_dsp_capability dsp_capability_vtcm_dsp; - dsp_capability_vtcm_dsp.domain = (uint32_t)domain; + dsp_capability_vtcm_dsp.domain = (uint32_t)domain; dsp_capability_vtcm_dsp.attribute_ID = attr; - dsp_capability_vtcm_dsp.capability = (uint32_t)0; + dsp_capability_vtcm_dsp.capability = (uint32_t)0; hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); @@ -1607,9 +1611,9 @@ static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { * Async fastrpc is supported only on CDSP */ struct remote_dsp_capability dsp_capability_async_support; - dsp_capability_async_support.domain = (uint32_t)domain; + dsp_capability_async_support.domain = (uint32_t)domain; dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT; - dsp_capability_async_support.capability = (uint32_t)0; + dsp_capability_async_support.capability = (uint32_t)0; hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLQNN_LOG_WARN("FastRPC Capability API is not supported on this device"); @@ -1643,16 +1647,17 @@ static void ggmlhexagon_set_rpc_latency(int domain, int qos, int latency) { if (remote_handle_control) { struct remote_rpc_control_latency data; #if 1 - data.enable = RPC_PM_QOS; + data.enable = RPC_PM_QOS; data.latency = 300; #else data.enable = RPC_POLL_QOS; data.latency = 1000; #endif - data.enable = qos; - data.latency = latency; + data.enable = qos; + data.latency = latency; hexagon_error = remote_handle64_control(DSPRPC_GET_DSP_INFO, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); - if (hexagon_error != AEE_SUCCESS){ + if (hexagon_error != AEE_SUCCESS) { + //FIXME: why set rpc latency failure GGMLQNN_LOG_WARN("failed with error 0x%x", hexagon_error); goto bail; } else { @@ -1676,9 +1681,9 @@ static bool ggmlhexagon_is_status_notification_supported(int domain) { * DSP User PD status notification Support */ struct remote_dsp_capability dsp_capability_status_notification_support; - dsp_capability_status_notification_support.domain = (uint32_t)domain; + dsp_capability_status_notification_support.domain = (uint32_t)domain; dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; - dsp_capability_status_notification_support.capability = (uint32_t)0; + dsp_capability_status_notification_support.capability = (uint32_t)0; hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLQNN_LOG_WARN("FastRPC Capability API is not supported on this device"); @@ -1718,9 +1723,9 @@ static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t * capability, u * HMX is supported on CDSP only */ struct remote_dsp_capability dsp_capability_hmx_dsp; - dsp_capability_hmx_dsp.domain = (uint32_t)domain; + dsp_capability_hmx_dsp.domain = (uint32_t)domain; dsp_capability_hmx_dsp.attribute_ID = attr; - dsp_capability_hmx_dsp.capability = (uint32_t)0; + 
dsp_capability_hmx_dsp.capability = (uint32_t)0; hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); @@ -1755,9 +1760,9 @@ static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) { * Query the Hexagon processor architecture version information */ struct remote_dsp_capability dsp_capability_arch_ver; - dsp_capability_arch_ver.domain = (uint32_t)domain; + dsp_capability_arch_ver.domain = (uint32_t)domain; dsp_capability_arch_ver.attribute_ID = ARCH_VER; - dsp_capability_arch_ver.capability = (uint32_t)0; + dsp_capability_arch_ver.capability = (uint32_t)0; hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_arch_ver, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); @@ -1774,7 +1779,7 @@ static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) { GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); } - bail: +bail: return hexagon_error; } @@ -1801,9 +1806,9 @@ static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t * capability, u * HVX is supported on CDSP only */ struct remote_dsp_capability dsp_capability_hvx_dsp; - dsp_capability_hvx_dsp.domain = (uint32_t)domain; + dsp_capability_hvx_dsp.domain = (uint32_t)domain; dsp_capability_hvx_dsp.attribute_ID = attr; - dsp_capability_hvx_dsp.capability = (uint32_t)0; + dsp_capability_hvx_dsp.capability = (uint32_t)0; hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF)==(AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); @@ -1834,8 +1839,8 @@ static int ggmlhexagon_request_status_notifications(int domain_id, void * contex struct remote_rpc_notif_register notif; bool status_notification_support; - notif.context = context; - notif.domain = domain_id; + notif.context = context; + notif.domain = domain_id; notif.notifier_fn = call_back_fn; status_notification_support = ggmlhexagon_is_status_notification_supported(domain_id); @@ -1851,7 +1856,7 @@ static int ggmlhexagon_request_status_notifications(int domain_id, void * contex return hexagon_error; } -//TODO:not work on cDSP currently +//TODO:not work on cDSP currently, this function will affect the performance of cDSP static AEEResult ggmlhexagon_set_clocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled) { #if 0 GGMLQNN_LOG_DEBUG("----------- entering power set clocks"); @@ -1936,8 +1941,8 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { return 2; } - if (domain_id == -1) { - if (domain_type != NULL) { + if (-1 == domain_id) { + if (NULL != domain_type) { if ((strcmp(domain_type, "NSP") != 0 && strcmp(domain_type, "HPASS") != 0)) { GGMLQNN_LOG_WARN("invalid domain_type %s. 
possible values are NSP or HPASS", domain_type); goto bail; @@ -2065,7 +2070,7 @@ static void ggmlhexagon_close_cdsp(ggml_backend_qnn_context * ctx) { if (-1 != ctx->ggmlop_handle) { hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); if (AEE_SUCCESS != hexagon_error) { - GGMLQNN_LOG_WARN("error 0x%x: failed to close ggmlop handle", hexagon_error); + GGMLQNN_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error); } else { ctx->ggmlop_handle = -1; } @@ -2119,6 +2124,7 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens dsptensor_1.data = src1->data; dsptensor_2.data = dst->data; + //make compiler happy dsptensor_0.ne[0] = src0->ne[0]; dsptensor_0.ne[1] = src0->ne[1]; dsptensor_0.ne[2] = src0->ne[2]; @@ -2153,13 +2159,17 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens dsptensor_1.data_len = ggml_nbytes(src1); dsptensor_2.data_len = ggml_nbytes(dst); + if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) { + dsptensor_0.data_len = ctx->desired_size; + } + dsptensor_0.type = src0->type; dsptensor_1.type = src1->type; dsptensor_2.type = dst->type; hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); if (AEE_SUCCESS != hexagon_error) { - GGMLQNN_LOG_WARN("ggmlop computation fail on cdsp"); + GGMLQNN_LOG_WARN("ggmlop %s computation fail on cdsp", ggml_op_name(op->op)); } } @@ -2206,11 +2216,11 @@ static const char * ggmlqnn_get_htparch_desc(size_t htp_arch) { static const char * ggmlqnn_get_inference_approach_name(int inference_approach) { switch (inference_approach) { - case 0: + case QNN_GENERAL: return "QNN_GENERAL"; - case 1: + case DIRECT_USE_CDSP: return "DIRECT_USE_CDSP"; - case 2: + case QNN_SINGLEGRAPH: return "QNN_SINGLEGRAPH"; default: return "unknown approach"; @@ -2437,9 +2447,6 @@ static void ggmlqnn_get_graphkey_from_cgraph(const ggml_cgraph * cgraph, std::st return; } - //output += "cgraph_" + std::to_string(ggml_time_us()); - //return; - bool is_start = true; for (int i = 0; i < cgraph->n_nodes; ++i) { auto * op = cgraph->nodes[i]; @@ -3108,7 +3115,7 @@ Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t ran return _qnn_rpc_buffer_to_handles[p_data]; } - auto mem_fd = rpcmem_to_fd(p_data); + int32_t mem_fd = rpcmem_to_fd(p_data); if (mem_fd == -1) { GGMLQNN_LOG_WARN("failed to get file descriptor"); return nullptr; @@ -3121,7 +3128,7 @@ Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t ran {{mem_fd}} }; Qnn_MemHandle_t handle = nullptr; - auto error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); + Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); if (error != QNN_SUCCESS) { GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); return nullptr; @@ -3497,7 +3504,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { const QnnDevice_PlatformInfo_t * p_info = nullptr; qcom_socinfo soc_info = {}; qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - if (qnnstatus == QNN_SUCCESS) { + if (QNN_SUCCESS == qnnstatus) { GGMLQNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; @@ -3518,8 +3525,8 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { 
} QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - soc_customconfig.socModel = soc_info.soc_model; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = soc_info.soc_model; QnnDevice_Config_t soc_devconfig; soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; soc_devconfig.customConfig = &soc_customconfig; @@ -3590,9 +3597,8 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free - || nullptr == _pfn_rpc_mem_to_fd) { - GGMLQNN_LOG_WARN("unable to access symbols in QNN RPC lib. dlerror(): %s", dlerror()); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { + GGMLQNN_LOG_WARN("unable to access symbols in QNN RPC lib, dlerror(): %s", dlerror()); dlclose(_rpc_lib_handle); return 8; } @@ -3613,13 +3619,13 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { if (_backend_name.find("Htp") != std::string::npos) { htp_print_info(); - htp_probe_rpc_meminfo(); if (0 != htp_init_perfinfra()) { GGMLQNN_LOG_WARN("initialize HTP performance failure"); } #if 1 + //FIXME: ht_set_rpc_polling + htp_set_high_performance_mode should be equivalent to htp_enter_performance_mode if (0 != htp_set_rpc_polling()) { GGMLQNN_LOG_WARN("set RPC polling failure"); } @@ -3627,13 +3633,14 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { GGMLQNN_LOG_WARN("set HTP high performance mode failure"); } #else - htp_set_memory_grow_size(); htp_enter_performance_mode(); #endif + htp_set_memory_grow_size(); + if (enable_qnn_rpc()) { - GGMLQNN_LOG_INFO("NPU RPC feature enabled"); + GGMLQNN_LOG_INFO("NPU RPC feature enabled with QNN-NPU backend"); } else { - GGMLQNN_LOG_INFO("NPU RPC feature disabled"); + GGMLQNN_LOG_INFO("NPU RPC feature disabled with QNN-NPU backend"); } } @@ -3657,7 +3664,7 @@ int qnn_instance::qnn_finalize() { if (nullptr != _pfn_rpc_mem_deinit) _pfn_rpc_mem_deinit(); - if (dlclose(_rpc_lib_handle) != 0) { + if (0 != dlclose(_rpc_lib_handle)) { GGMLQNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); } else { GGMLQNN_LOG_DEBUG("succeed to close rpcmem lib\n"); @@ -3713,10 +3720,9 @@ int qnn_instance::qnn_finalize() { } unload_backend(); - unload_system(); - GGMLQNN_LOG_DEBUG("leave %s\n", __func__); + GGMLQNN_LOG_DEBUG("leave %s\n", __func__); return ret_status; } @@ -3727,7 +3733,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi GGMLQNN_LOG_DEBUG("[%s][%s]created", ggml_backend_qnn_get_devname(device), graph_name.c_str()); Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (device == QNN_BACKEND_NPU) { + if (QNN_BACKEND_NPU == device) { QnnHtpGraph_CustomConfig_t hvx_config; hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; hvx_config.numHvxThreads = hvx_threads; @@ -3781,7 +3787,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi } else { error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &_qnn_graph_handle); } - if (error != QNN_SUCCESS) { + if (QNN_SUCCESS != error) { GGMLQNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", 
ggml_backend_qnn_get_devname(device), graph_name.c_str(), ggmlqnn_get_qnnerror_string(error)); @@ -3789,7 +3795,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi } GGMLQNN_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); - if (device == QNN_BACKEND_NPU) { + if (QNN_BACKEND_NPU == device) { htp_set_n_hvx_threads(hvx_threads); } return QNN_SUCCESS; @@ -3797,7 +3803,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend devi int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, const QnnGraph_Config_t ** graph_configs) { - int result = 0; + Qnn_ErrorHandle_t result = 0; if (nullptr == graph_name) { GGMLQNN_LOG_WARN("graph name is null\n"); @@ -3813,19 +3819,19 @@ int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do GGMLQNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); } - _graph_name = graph_name; - _debug_tensor = debug; - _do_node_validations = do_node_validation; + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; result = _qnn_raw_interface.graphCreate(_qnn_context_handle, graph_name, graph_configs, &_qnn_graph_handle); - if (result != QNN_GRAPH_NO_ERROR || nullptr == _qnn_graph_handle) { + if (QNN_GRAPH_NO_ERROR != result || nullptr == _qnn_graph_handle) { GGMLQNN_LOG_WARN("failed to create graph in qnn context\n"); return 3; } else { - GGMLQNN_LOG_INFO("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); + GGMLQNN_LOG_DEBUG("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); } return 0; @@ -3848,8 +3854,8 @@ int qnn_instance::finalize_qnn_graph() { int qnn_instance::htp_init_perfinfra() { QnnDevice_Infrastructure_t device_infra = nullptr; - int error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); - if (error != QNN_SUCCESS) { + Qnn_ErrorHandle_t error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (QNN_SUCCESS != error) { GGMLQNN_LOG_WARN("failed to get qnn device infra\n"); return 1; } @@ -3964,8 +3970,8 @@ void qnn_instance::htp_set_memory_grow_size(size_t size) { &grow_size_config, nullptr, }; - Qnn_ErrorHandle_t ret = _qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); - if (ret != QNN_SUCCESS) { + Qnn_ErrorHandle_t result = _qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); + if (QNN_SUCCESS != result) { GGMLQNN_LOG_WARN("failed to set HTP memory config"); } else { GGMLQNN_LOG_INFO("succeed to set HTP memory config"); @@ -3984,8 +3990,8 @@ void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { }; const QnnGraph_Config_t * graph_configs[] = {&hvx_thread_config, nullptr}; - Qnn_ErrorHandle_t ret = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); - if (ret != QNN_SUCCESS) { + Qnn_ErrorHandle_t result = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); + if (QNN_SUCCESS != result) { GGMLQNN_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); } else { GGMLQNN_LOG_INFO("succeed to set QNN graph config: set hvx threads %d", n_threads); @@ -4319,7 +4325,7 @@ static void ggmlqnn_load_cfg() { memset(time_string, 0, GGML_QNN_TMPBUF_LEN); ggmlqnn_get_timestring(time_string); GGMLQNN_LOG_DEBUG("program running start time:%s", time_string); - ggmlqnn_disable_android_tags(1); + ggmlqnn_disable_android_tags(0); 
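/*
 * Editor's note (illustrative, not part of this patch): the code below loads an
 * ini-style config through the qnn_cfg helper, using the section/key pairs parsed by
 * get_intvalue()/get_stringvalue(). Assuming the usual "key = value" syntax under
 * [section] headers, a minimal config file exercising exactly these keys could look
 * like this (the values shown are just the defaults passed to the getters):
 *
 *   [general]
 *   print_qnn_internal_log = 0
 *   enable_perf            = 0
 *   print_tensors_info     = 0
 *   dump_op_info           = 0
 *   inference_approach     = 0    ; 0: QNN_GENERAL, 1: DIRECT_USE_CDSP, 2: QNN_SINGLEGRAPH
 *   qnn_backend            = 2
 *
 *   [npu]
 *   hvx_threads     = 4
 *   vtcm_size_in_mb = 8
 *   enable_dlbc     = 0
 *   precision_mode  = fp32
 */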
std::string cfg_filename = std::string(g_qnn_params.qnn_runtimelib_path) + std::string(g_qnn_params.qnn_cfgfilename); GGMLQNN_LOG_INFO("load ggml-qnn config from %s", cfg_filename.c_str()); @@ -4382,14 +4388,22 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons struct ggml_tensor * src0 = op_tensor->src[0]; struct ggml_tensor * src1 = op_tensor->src[1]; const int64_t ne00 = op_tensor->src[0]->ne[0]; - uint32_t src0_rank = ggml_n_dims(src0); + uint32_t src0_rank = 0; uint32_t src1_rank = 0; + if (nullptr != src0) { + src0_rank = ggml_n_dims(src0); + } else { + GGMLQNN_LOG_DEBUG("op name %s\n", ggml_op_name(op_tensor->op)); + } if (nullptr != src1) { src1_rank = ggml_n_dims(src1); + } else { + GGMLQNN_LOG_DEBUG("op name %s\n", ggml_op_name(op_tensor->op)); } //FIXME: mulmat on cDSP doesn't work as expected - if (op_tensor->op != GGML_OP_ADD) + bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); + if (!support) return false; ggmlqnn_dump_op_info(op_tensor); @@ -4397,7 +4411,12 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons return false; } - return ggmlqnn_same_types(ctx, op_tensor); + support = ggmlqnn_same_types(ctx, op_tensor); + if (!support) { + return false; + } + + return (src0_rank <= 2); } static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { @@ -4406,7 +4425,13 @@ static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const st } if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { - return ggmlhexagon_can_handle_op(ctx, op_tensor); + //return ggmlhexagon_can_handle_op(ctx, op_tensor); + //bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); + //FIXME: mulmat on cDSP doesn't work as expected + bool support = (op_tensor->op == GGML_OP_ADD); + if (!support) + return false; + } if (!ggmlqnn_k_op_caps[ggmlqnn_get_op_index(op_tensor)].supported) { @@ -4445,7 +4470,7 @@ static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const st return false; } - if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix mul return false; return ggmlqnn_same_types(ctx, op_tensor); diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c index b5cfd8810cfe6..fa00d9bc5614f 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c @@ -1,3 +1,24 @@ +/* +* Copyright (c) 2023-2025 The ggml authors +* +* Permission is hereby granted, free of charge, to any person obtaining a copy +* of this software and associated documentation files (the "Software"), to +* deal in the Software without restriction, including without limitation the +* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +* sell copies of the Software, and to permit persons to whom the Software is +* furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice shall be included in +* all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +*/ #include #include #include @@ -7,45 +28,32 @@ #include "HAP_farf.h" #include "ggmlop_ap_skel.h" -#define ggml_tensor dsptensor - -int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { - void *tptr = NULL; - FARF(HIGH, "uri %s", uri); - tptr = (void *)malloc(1); - *handle = (remote_handle64)tptr; - assert(*handle); - return 0; -} - -int ggmlop_dsp_close(remote_handle64 handle) { - if (handle) - free((void*)handle); - return 0; -} - -static void ggml_dump_tensor(struct ggml_tensor * tensor) { - FARF(HIGH, "ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)\n", - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], - tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); -} - -static void ggml_abort(const char * file, int line, const char * fmt, ...) { - //abort(); - return; -} +#define ggml_tensor dsptensor #define GGML_MAX_DIMS 4 #define GGML_UNUSED(x) (void)(x) +#define UNUSED GGML_UNUSED #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) #define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) +#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) +#if UINTPTR_MAX == 0xFFFFFFFF +#define GGML_MEM_ALIGN 4 +#else +#define GGML_MEM_ALIGN 16 +#endif + #define GGML_RESTRICT #define static_assert(a, b) do { } while (0) +typedef uint16_t ggml_fp16_t; +typedef struct { uint16_t bits; } ggml_bf16_t; +typedef double ggml_float; + + #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ const type prefix##0 = (pointer)->array[0]; \ GGML_UNUSED(prefix##0); @@ -125,10 +133,124 @@ enum ggml_type { GGML_TYPE_COUNT = 39, }; +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); +static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); + +typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, + const void * GGML_RESTRICT y, size_t by, int nrc); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +struct ggml_type_traits { + const char * type_name; + int64_t blck_size; + int64_t blck_size_interleave; // interleave elements in blocks + size_t type_size; + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float_ref; +}; + +struct ggml_type_traits_cpu { + ggml_from_float_t from_float; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + int64_t nrows; // number of rows to process simultaneously +}; + +static const struct ggml_type_traits_cpu type_traits_cpu[1] = { 
+ [GGML_TYPE_F32] = { + .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, + .vec_dot_type = GGML_TYPE_F32, + .nrows = 1, + }, +}; + +static const struct ggml_type_traits type_traits[1] = { + [GGML_TYPE_F32] = { + .type_name = "f32", + .blck_size = 1, + .type_size = sizeof(float), + .is_quantized = false, + }, + +}; + +static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { + return &type_traits_cpu[type]; +} + +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + ggml_float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += (ggml_float)(x[i]*y[i]); + } + *s = sumf; +} + +static const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { + return &type_traits[type]; +} + +static int64_t ggml_blck_size(enum ggml_type type) { + return type_traits[type].blck_size; +} + +static size_t ggml_type_size(enum ggml_type type) { + return type_traits[type].type_size; +} + +static size_t ggml_row_size(enum ggml_type type, int64_t ne) { + assert(ne % ggml_blck_size(type) == 0); + return ggml_type_size(type)*ne/ggml_blck_size(type); +} + +static size_t ggml_nbytes(const struct ggml_tensor * tensor) { + size_t nbytes; + const size_t blck_size = ggml_blck_size(tensor->type); + if (blck_size == 1) { + nbytes = ggml_type_size(tensor->type); + for (int i = 0; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + else { + nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; + for (int i = 1; i < GGML_MAX_DIMS; ++i) { + nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; + } + } + + return nbytes; +} + +static size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { + return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); +} + +static double ggml_type_sizef(enum ggml_type type) { + return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; +} + +static const char * ggml_type_name(enum ggml_type type) { + return type < GGML_TYPE_COUNT ? 
type_traits[type].type_name : "NONE"; +} + +static bool ggml_is_quantized(enum ggml_type type) { + return type_traits[type].is_quantized; +} + static bool ggml_is_empty(const struct ggml_tensor * tensor) { for (int i = 0; i < GGML_MAX_DIMS; ++i) { if (tensor->ne[i] == 0) { - // empty if any dimension has no elements return true; } } @@ -161,12 +283,16 @@ static int64_t ggml_nrows(const struct ggml_tensor * tensor) { return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } +static bool ggml_is_transposed(const struct ggml_tensor * tensor) { + return tensor->nb[0] > tensor->nb[1]; +} + static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { - size_t next_nb = sizeof(float);//ggml_type_size(tensor->type); - if (tensor->ne[0] != 1/*ggml_blck_size(tensor->type)*/ && tensor->nb[0] != next_nb) { + size_t next_nb = ggml_type_size(tensor->type); + if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) { return false; } - next_nb *= tensor->ne[0]/1/*ggml_blck_size(tensor->type)*/; + next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type); for (int i = 1; i < GGML_MAX_DIMS; i++) { if (tensor->ne[i] != 1) { if (i > n) { @@ -193,24 +319,34 @@ static bool ggml_is_contiguous(const struct ggml_tensor * tensor) { inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } -static void ggml_compute_forward_add_f32( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_dump_tensor(const ggml_tensor * tensor) { + FARF(HIGH, "ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)\n", + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); +} - ggml_dump_tensor(src0); - ggml_dump_tensor(src1); -#if 1 - float * a = (float*)src0->data; - float * b = (float*)src1->data; - float * c = (float*)dst->data; - //TODO: Hexagon SIMD - for (size_t idx = 0; idx < src0->data_len; idx++) { - *c = *a + *b; - a++; - b++; - c++; - } +static void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ + //abort(); return; -#endif +} + +int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { + void *tptr = NULL; + FARF(HIGH, "uri %s", uri); + tptr = (void *)malloc(1); + *handle = (remote_handle64)tptr; + assert(*handle); + return 0; +} + +int ggmlop_dsp_close(remote_handle64 handle) { + if (handle) + free((void*)handle); + return 0; +} + +static void ggml_compute_forward_add_f32( + const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); const int ith = 0; @@ -230,6 +366,23 @@ static void ggml_compute_forward_add_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); + ggml_dump_tensor(src0); + ggml_dump_tensor(src1); + +#if 1 //naive algorithm, can works with llama-cli + float * a = (float*)src0->data; + float * b = (float*)src1->data; + float * c = (float*)dst->data; + //TODO: Hexagon SIMD + for (size_t idx = 0; idx < src0->data_len; idx++) { + *c = *a + *b; + a++; + b++; + c++; + } + return; +#endif + if (nb10 == sizeof(float)) { for (int ir = ir0; ir < ir1; ++ir) { // src1 is broadcastable across src0 and dst in i1, i2, i3 @@ -247,11 +400,7 @@ static void ggml_compute_forward_add_f32( float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); -#else ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif } } } else { @@ -290,55 +439,8 @@ int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tenso } else { GGML_ABORT("fatal error"); } - } break; - case GGML_TYPE_F16: - { - if (src1->type == GGML_TYPE_F16) { - //ggml_compute_forward_add_f16_f16(dst); - } - else if (src1->type == GGML_TYPE_F32) { - //ggml_compute_forward_add_f16_f32(dst); - } - else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_BF16: - { - if (src1->type == GGML_TYPE_BF16) { - //ggml_compute_forward_add_bf16_bf16(dst); - } - else if (src1->type == GGML_TYPE_F32) { - //ggml_compute_forward_add_bf16_f32(dst); - } - else { - GGML_ABORT("fatal error"); - } - } break; - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_TQ1_0: - case GGML_TYPE_TQ2_0: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ1_M: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - case GGML_TYPE_IQ2_S: - { - //ggml_compute_forward_add_q_f32(dst); - } break; + break; + } default: { GGML_ABORT("fatal error"); @@ -349,124 +451,204 @@ int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tenso } -int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - FARF(HIGH, "=============== DSP: ggmlop_dsp_mulmat "); +static void ggml_compute_forward_mul_mat_one_chunk( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst, + const enum ggml_type type, + const int64_t num_rows_per_vec_dot, + const int64_t ir0_start, + const int64_t ir0_end, + const int64_t ir1_start, + const int64_t ir1_end) { GGML_TENSOR_BINARY_OP_LOCALS - ggml_dump_tensor(src0); - ggml_dump_tensor(src1); + const bool src1_cont = ggml_is_contiguous(src1); + + 
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + + // broadcast factors + const int64_t r2 = ne12 / ne02; + const int64_t r3 = ne13 / ne03; + + //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end); + + // threads with no work simply yield (not sure if it helps) + if (ir0_start >= ir0_end || ir1_start >= ir1_end) { + return; + } + + //FIXME:hardcode to src1->data + const void * wdata = src1->data; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int64_t blck_0 = 16; + const int64_t blck_1 = 16; + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + + // attempt to reduce false-sharing (does not seem to make a difference) + // 16 * 2, accounting for mmla kernels + float tmp[32]; + + for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int64_t i13 = (ir1 / (ne12 * ne1)); + const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + // broadcast src0 into src1 + const int64_t i03 = i13 / r3; + const int64_t i02 = i12 / r2; + + const int64_t i1 = i11; + const int64_t i2 = i12; + const int64_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char*)wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); - const int vec_dot_type = 0; - int64_t const vec_dot_num_rows = 1; + //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot); + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + } + } + } + } +} + +int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + + GGML_TENSOR_BINARY_OP_LOCALS + + enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; + int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); GGML_ASSERT(ne2 == ne12); GGML_ASSERT(ne3 == ne13); - GGML_ASSERT(nb00 == sizeof(float)); - GGML_ASSERT(nb10 == sizeof(float)); + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); GGML_ASSERT(nb0 <= nb1); GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); +#if 1 //naive algorithm for fp32, can pass various case in UT + { + ggml_dump_tensor(src0); + ggml_dump_tensor(src1); + + float * a = (float*)src0->data; + float * b = (float*)src1->data; + float * c = (float*)dst->data; + int M = src0->ne[1]; + int K = src0->ne[0]; + int N = src1->ne[1]; + float sum = 0; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + sum = 0; + for (int h = 0; h < K; h++) { + sum += a[i * K + h] * b[h * N + j]; + } + c[i * N + j] = sum; + } + } + return 0; + } +#endif + + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) const int64_t nr0 = ne0; + + // This is the size of the rest of the dimensions of the result const int64_t nr1 = ne1 * ne2 * ne3; + // Now select a reasonable chunk size. int chunk_size = 16; - int nth = 1; + // We need to step up the size if it's small if (nr0 == 1 || nr1 == 1) { chunk_size = 64; } + // distribute the work across the inner or outer loop based on which one is larger + // The number of chunks in the 0/1 dim. + // CEIL(nr0/chunk_size) int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; - if (nchunk0 * nchunk1 < nth * 4) { - nchunk0 = nr0 > nr1 ? nth : 1; - nchunk1 = nr0 > nr1 ? 1 : nth; + // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. + // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915 + // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that. + if (nchunk0 * nchunk1 < 4) { + // distribute the thread work across the inner or outer loop based on which one is larger + nchunk0 = 1; // parallelize by src0 rows + nchunk1 = 1; // parallelize by src1 rows } + // The number of elements in each chunk const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + // The first chunk comes from our thread_id, the rest will get auto-assigned. 
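+    // note: while the "#if 1" naive f32 path above is enabled, the function returns
+    // before reaching this loop, so the chunked path below is effectively dead code.
+    // on the cDSP this entry point runs single-threaded, so the loop simply walks
+    // every chunk in order: current_chunk goes from 0 to nchunk0*nchunk1 - 1.
+    // illustrative example (assumed shapes): nr0 = 100, nr1 = 8, chunk_size = 16
+    //   -> nchunk0 = 7, nchunk1 = 1, dr0 = 15, dr1 = 8, i.e. seven chunks covering
+    //      result rows [0,15), [15,30), ..., [90,100), each spanning all of nr1.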
int current_chunk = 0; - const int64_t ith0 = current_chunk % nchunk0; - const int64_t ith1 = current_chunk / nchunk0; - - const int64_t ir0_start = dr0 * ith0; - const int64_t ir0_end = MIN(ir0_start + dr0, nr0); - - const int64_t ir1_start = dr1 * ith1; - const int64_t ir1_end = MIN(ir1_start + dr1, nr1); - - int64_t num_rows_per_vec_dot = vec_dot_num_rows; - - const int src1_cont = ggml_is_contiguous(src1); - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; - - const void * wdata = src1->data; - const size_t row_size = sizeof(float) * ne10; - assert(ne12 % ne02 == 0); - assert(ne13 % ne03 == 0); + while (current_chunk < nchunk0 * nchunk1) { + const int64_t ith0 = current_chunk % nchunk0; + const int64_t ith1 = current_chunk / nchunk0; - const int64_t blck_0 = 16; - const int64_t blck_1 = 16; + const int64_t ir0_start = dr0 * ith0; + const int64_t ir0_end = MIN(ir0_start + dr0, nr0); - float tmp[32]; + const int64_t ir1_start = dr1 * ith1; + const int64_t ir1_end = MIN(ir1_start + dr1, nr1); - for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { - for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { - for (int64_t ir1 = iir1; - ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { - const int64_t i13 = (ir1 / (ne12 * ne1)); - const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; - const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int64_t num_rows_per_vec_dot = vec_dot_num_rows; - const int64_t i03 = i13 / r3; - const int64_t i02 = i12 / r2; - - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; - - const char * src0_row = (const char *)src0->data + (0 + i02 * nb02 + i03 * nb03); - - const char * src1_col = (const char *)wdata + - (src1_cont || src1->type != vec_dot_type - ? 
(i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size - : (i11 * nb11 + i12 * nb12 + i13 * nb13)); - float * dst_col = (float *)((char *) dst->data + - (i1 * nb1 + i2 * nb2 + i3 * nb3)); - - - for (int64_t ir0 = iir0; - ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - - float sumf = 0.0; - const float * GGML_RESTRICT x = (float*)src0_row + ir0 * nb01; - const float * GGML_RESTRICT y = (float*)src1_col; - float * GGML_RESTRICT s = &tmp[ir0 - iir0]; - for (int i = 0; i < ne00; i++) { - sumf += x[i] * y[i]; - } - *s = sumf; - - } + // these checks are needed to avoid crossing dim1 boundaries + // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity + if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { + num_rows_per_vec_dot = 1; + } + ggml_compute_forward_mul_mat_one_chunk(src0, src1, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); - for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { - memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), - (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); - } - } + if (1 >= nchunk0 * nchunk1) { + break; } + current_chunk++; } - return 0; + return 0; } diff --git a/ggml/src/ggml-qnn/kernels/libggmlop_skel.so b/ggml/src/ggml-qnn/kernels/libggmlop_skel.so index 8695cbf36949a45dfd7ce3d7cec38ad6f8fea6c4..b92e38c950500e09eb9e76cd961c2de86de2898d 100755 GIT binary patch delta 3151 zcma);e{56N701tePW&s0@sHR}91|RSB*CLJBmtoeMK5UzBm_AE&%Niq`|I8F?!>dRT_=6gpjsBtcesf<+-i#__p(fMS?#dm42eFur^vk zzb1;nO$xv8w`qW7{u|VX205_YAO{1zh7Q&z>P=J#P7BAEWUv)nk-=PF#q1(#g0YjC z)=G~MDS1Q_=Hv)l5m1Q{zm^Cv(JruJAbKvr2HKPL!o3;H9rlUqav+wgdqNU>avtg| zMiFaTIE01QC>QF4^0-*AAy^`!0%$(85Nd&9bc=#fCe~sJET%!&PSQb()H`mWNIB}f zRWhh|0!}>&SsY1|c-q7db2oazXm`U$6hPS^{OOcG>k@y@{VKxSH;ff8D@wGXG5CHS z4&>+Tr^+YT=k6hA+2`RvqU@)p`40Q3kz8OO3(4{>`#c4R#=e~NC;ixBzby%uxFS!z z;}K&9t9>N0u;l3PDO@xiy!&VP6c=&BR{gUIKCWxRWei5xwP?*7VE;R&BYVXN|F$m*T{Y((re zxtFiOTd1YEiSwyDVcZVDyEbK1&C^~{ zolV}mbq&6@pmI!Y)x@cnT5NNQ@@kJ=dm-d)a^5XOoK5?&N)4`dpTcoM-L8$CI^LQ$ z$FUBy&(pd?R;35=jzoM=8&PQxhhL${c4g$0O%p1|=PBKx>8dkTFSSH$b6h~$i5^RX z<+DPK+FnsiQEp_sI(){Z(c`3?==mbN(8e#}8FKv+bS`P5>hVXUxkT)Zvu7$bV^n(B z_|NVy7uw8dCqJyeC215s^5Vjd#<)C;_F-qKx86`{sV~(^e}Ww%5(CzcPFql( z9}sUkU1H2OWR4Cj>{xZ<;`rMGVwL@>r3uL07foFpUq2wsjw$)%fcV5wZ;tja?2z6o z96#GHf`$JmU&?#Exw&gSib>laDr}(W?U4I|8|8{dR;Sh;s_qx{&Ykk7edz6V^Dp|6 zJ&pB=MpxXd0E^IZU7xVHr=&*lvBM$0a95h6y{V3RMX+d}yscNfUbI1O?-idEm6v)^ zsbPBPCM-|KzE!BShc0MjFW}*96XnIF{OA%K?nJ8>LZO814ow~uM?L$@wW;DN1edf(SHzZ*=DeDkb-v&;Pwwk@ZjTr* zamRe#8Qtlny7NmXWX^Vv9!2)$1U!NVr^->-bc*Gi#hIJaDd)-@m=GR%SpE%gTIE_gw%l00TaZdw^d!2he_p1&;i zFaMXin5jZ1DZ38rd1_b3^WD3iuBi@G9(ca9V@05{v!;5bFVww%rSE@ws8|tL9SFp3 zR{TZU%~N90B_^1ha*}!I$-yNVi5-&uJ?j$x8WdnkEV{%35N%rI&XlI)+k&*JGwR>4PE zJe$S0!MvO-R=Tqw0W#SE$h_!zz?s(^A2|J`@Wm}si-OFXU;{YwBK9aa^QN~OoOzG> zK3Kzsr*E2v!3w^x^u6m1Fn@~tjqt_nXB_WK-W+XYiO@L|`21hr1gF56SIK#BKk8Ym zvXvxUiv|}NcSrE`G-E}a;I!}-d^>|V>^m83CvpFQKgI5^3P`cas_#moOP4QBuih{s z6>}OApRZmgofONfmzgv4va2Nt>)TkV#D{{hfUnvj6qY zoIU4v?z#7#d+y^qcRy?G+kV2#$zt|wjVhBdRi?NyT|LAzHY9HQ&cVVZ5qvJFPS4mJ z4P%K+$Cdz}19gHD7{^M%t9AH6F`#}`NDTf``?XQ{2(US9PyL++J`J1!PWbnXIascR zu{BU6uo>*{;E_TddLjUv@K+!+>{Z}opTsfNj1HFv>ve1{aHNp^{3+M~TrveyeHpil zF<};C6#$7$&#GY11AXB>j<} zZCJm4>!yyDExUHE-?(k>#%)FYUcQmj?AWy9Df>QYzN%seJYq!2pcGJ~BtU>?m^ouu zK&AynN;-tJFrc}mhUFx18k)*<&=ijt)|u(Vdnwans5V^G;=Q6Yif}wA4m1;lIWr?@ znzTY)m2Sog4w)I74NWfOk+6IsL;9`yzQc@&9m6tZlTRqUkS#Pe%I`=H>-$I!`}{V^ 
z;klkAndXXej^wa=7f2>gP~IUqJV%-2a4g&wmcuOJ316lPKGfhU$rKor4@iy^@1;vZ ztRe9w$K;G(53r8Zk-V=zhs*}04Y6x9tpawcES-!j;(<4w`y85EvG`RLq5yF?HZmo$sCB3H2$v!YS`2;5Sqhc0V zhe2qu8#2cKxnEMWDRZj@7+7msS(d7lWUjWR)sB|DjB{@7^5`!!Tbq=od&6778vg$<^ab zt*rLz%Hje8x#rHaDs6xhj(dt46=&*I#hF&4323TW$E-DqbAh|Z$%K=hWMST(X4xgG zts2MfE_9M5D^5+BOUwpO7GB#kPZ$tm2Nh=}xCwkScpP{)cpCU1xEcHocnr9VKKP*C zE$YyR4>%dvS8w$4WdouXI=((-;tnRDaw%|vIW%Q%F=^1DID4O+Y$64b5skjBOZBD; z+yphqL^MvI-dCUM4cf@e!8Zzijlg%#ky%H~$1D(YF+Cu-x;=cOYCNVXW!xaB#uF~4 zjPId#E~D2@9DVY22@~mXMBB_deR=n8an%Prl@2R^%=iSe9k|j{#iP?+goPHUqQPt{ zsZgpK29?ULGDR$LDnwx*{rNizsUZK9rb8IWK%_~=`_yMJgVXK!3FM8VZ z+&8tARzHKIS#VTW#Rm>q4~f%BE_Y}GDi|6rE6#KQXG73-@_8|xK3V0ZwesO~@X#4C z3$0r{Ayjl9xNlAOYMNs^IqVfz<9V-7NcLu6x6i`K7!zLij=GAy6;{tkQfhXfQL3xapEi&lVD%0Dv?e~l52s9UAEgGk>&XG%I~cMJefLF0R1I4F>Pvh;J)jAUR-}73LdRQGUq((r zYBTHn^d~*yDGYktdQkjyphsjv`QiF%v+Y%DPx&6`mzk`kT4AI8Dr8lKfF;;$AI4@r z=sGBBDr^1fi&{q@7MBTmIUfUz4XV5!E-!E3UFxqjxNydVVs8PCS!KD#ol~IBjYNPV zQdak(r$8{A%@w*|c|`uzD~GFffp0&#ZSyrSuVys!Y?pA#V}dj4vF>GM!poi;|G8~5 zV$((@z&2!_H3*KK_D4#^y0^ZcY@(UR3wHZ39C4cZN>xI$T_bF;=T<1@P zQ~};v?RR4hW9Vj+H`-MdwSJ$Kd99avMD_ccfX~fvSHyqK!3jYyy1UW07JUflF(s77VbFMljRxKJP&>nym&tZlWcnVQTk#G*RPejC3Zx zR9}Z0ZoufQ-7kF+|JA%K>RqU5y}1t!>$vLQIgEAxUnwd-h2WW<1Fs z*e5+{IHj+IC5G+ZUWk#&^1(_wzAYZDbJ2Op|VVEq?zVJB@lT=&{OW{g9 zs*~_3H*FUv!w>|%{pAm!8PzzhQohAN&F^CFO)a0`~z^&P#Y4{M#w@4}d99!5H-S4WXdNsD_GjQD z4{-u>1D^*T;Pbx!0t^t z9*a_}0zQX@7yLfpW~}G(puLVA1)lmsKLtGH z{p(GwN!IND4kKgOk(uI%HohH4#u!K7+`4?vCd_49t=#n(O zFiS_b0g+EyRG6hcf@|xD^hDuo9o<$$r=`Qt(?a!1Zx+thM{Xs?NyCLEJ*^m#Hc@&| zXwpzTg;|oN$fBe3Pt+={F3Qpb4_9XY(?#hVS0O!LJX Date: Mon, 24 Mar 2025 22:37:35 +0800 Subject: [PATCH 132/200] ggml-qnn: add build script for libggmlop_skel.so --- ggml/src/ggml-qnn/kernels/Makefile | 24 ++++++++++++++++++++ ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c | 23 ++++--------------- ggml/src/ggml-qnn/kernels/libggmlop_skel.so | Bin 13672 -> 0 bytes scripts/build-run-android.sh | 23 +++++++++++++++++++ 4 files changed, 51 insertions(+), 19 deletions(-) create mode 100755 ggml/src/ggml-qnn/kernels/Makefile delete mode 100755 ggml/src/ggml-qnn/kernels/libggmlop_skel.so diff --git a/ggml/src/ggml-qnn/kernels/Makefile b/ggml/src/ggml-qnn/kernels/Makefile new file mode 100755 index 0000000000000..879bd3444ee1b --- /dev/null +++ b/ggml/src/ggml-qnn/kernels/Makefile @@ -0,0 +1,24 @@ +HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 + +TARGET=libggmlop_skel.so + +INCS=-I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc + +CFLAGS=-mv75 -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B ${INCS} + +LDFLAGS=-mv75 -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET} + +SRCS = ggmlop_cdsp.c ggmlop_cdsp_skel.c +OBJS = $(patsubst %.c, %.o, $(SRCS)) + +ALL:$(OBJS) + ${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang ${LDFLAGS} -o ${TARGET} -Wl,--start-group ${OBJS} -Wl,--end-group + @ls -l ${TARGET} + +%.o:%.c + @echo "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang ${CFLAGS} -D__FILENAME__=\"$<\" -o $@ -c $< 
" + ${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang ${CFLAGS} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +clean: + rm -f *.o diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c index fa00d9bc5614f..bdb39fdf1f2a3 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c @@ -349,9 +349,6 @@ static void ggml_compute_forward_add_f32( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - const int ith = 0; - const int nth = 1; - const int nr = ggml_nrows(src0); GGML_TENSOR_BINARY_OP_LOCALS @@ -359,17 +356,16 @@ static void ggml_compute_forward_add_f32( GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); - // rows per thread - const int dr = (nr + nth - 1)/nth; + const int dr = nr; // row range for this thread - const int ir0 = dr*ith; + const int ir0 = 0; const int ir1 = MIN(ir0 + dr, nr); ggml_dump_tensor(src0); ggml_dump_tensor(src1); -#if 1 //naive algorithm, can works with llama-cli +#if 1 //naive algorithm for fp32, can works with llama-cli float * a = (float*)src0->data; float * b = (float*)src1->data; float * c = (float*)dst->data; @@ -473,9 +469,6 @@ static void ggml_compute_forward_mul_mat_one_chunk( const int64_t r2 = ne12 / ne02; const int64_t r3 = ne13 / ne03; - //printf("ir0_start = %6lld, ir0_end = %6lld, ir1_start = %6lld, ir1_end = %6lld\n", ir0_start, ir0_end, ir1_start, ir1_end); - - // threads with no work simply yield (not sure if it helps) if (ir0_start >= ir0_end || ir1_start >= ir1_end) { return; } @@ -514,20 +507,12 @@ static void ggml_compute_forward_mul_mat_one_chunk( const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this const char * src1_col = (const char*)wdata + (src1_cont || src1->type != vec_dot_type ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size : (i11 * nb11 + i12 * nb12 + i13 * nb13)); float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); - //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { - // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); - //} - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot); } @@ -574,7 +559,7 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te int M = src0->ne[1]; int K = src0->ne[0]; int N = src1->ne[1]; - float sum = 0; + float sum = 0; for (int i = 0; i < M; i++) { for (int j = 0; j < N; j++) { sum = 0; diff --git a/ggml/src/ggml-qnn/kernels/libggmlop_skel.so b/ggml/src/ggml-qnn/kernels/libggmlop_skel.so deleted file mode 100755 index b92e38c950500e09eb9e76cd961c2de86de2898d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 13672 zcmeHOe|%fjb-#LY96Je)Y|B6Li$4E|ane|p9oun&15dIP+rcCzG1P!Rm1Rk`<;a%M z4_Bs)xJhV%DJBZgu_f-}lrgrjR-ZM4tb-3P#$ka`ma;6%Xx*BS?NeaPX`F79c6i_O zemKuc7_=R2|Hy^wbH4YSd+v{S-+f2-p6s+YZ_?>>Oig^$JMD~uW0F>|qG7o*4ThdQKKTpAaog`0ZMqo9O{xS{5qhvQ|Fv-n? za~ZRNWU&RT23T0g*wYK-{+|%5JYkgec%sg3f4>6- zuE3zf=f%<)bU1z9?tYKk(dX@7TiH1n_Jqh-xK?6O%N2Dxy1e~RyVv6gg(IEXW?epi z$fFj}QdAmsc>4$ZJ3Nj^(90a#!~Td5mV%ycZz${uI=X_+J`Ys7{Ekj%$P@ZM4BX@N zx@brtuYb;X{Q-<0(}4eB`Ab1(09HL7Dhm5|I8cZ_dq3W=DxtY^y4?)D@nViFp^zs? zb0t~qbNYOKSJE6Xyn8OZaYVaav)r7w5szz$lIhV=qGw8pmNe}jImqJ;Q5pItruPvF@Yi!w2PS|<+0(?p$t6Ywc0)t6R@?5-Ke51e4pWk1NDJ}~o( z7nyhZ3*Don($7q8#P#6R6ejqVVz*Pm+Y&9ELkw^cY8uM-c( zGprdgCLWaJq@zuwpCLB&E*FR6q|;0I%_Mv}3BR6%PbT59%h*fyewc*MBw_SdZC}!V zH3^Sj#@fCU&tXoVz&z?E2$;yFp0rKwoz@3A+z8i+c{FH!XV4@W$aIX@V;-UvLLj2@l2##IMEE zAJV-yPv{KjPXF2Hs~rQG=<`k{?5Uq8?yjf$dOLfM30MmlOWK*kM+A&BXTi$xgm5Bj z{(_8mzbfpGKO>G64~^}K+pMXv2K4(Iz?g&RF7$Ud`gdJJ+WU((rM;gnx&qC@qVZeD zt{XQ9B~t&YEc0W_1#ch&`sL#Mfo!2Ku-u*gi;l_W%riSLy+~LGS%LUiWVvnmbSBo| zLh+$^4)_fgzJ0Z@H{K#Xdf+RYGp7XMu}GE}3M{wfOdFw>DLx)&mfQ|rAiYQ9w~PA^ z47V(tBHe7%UnqtH1|b6dY)OB$xIfN@X#8yF?ZW;8nc|~y(qAZq182=g%v6ss6Ca6W zSTf!-imd%1Aw5BL(q4}p$rV`}6CR0t5S?k4<`MHp^2KOdhxcZ9t8PltJ^jp)QZX%| zv!`7Q|9qx>0qSw~)c18Fi#~Y&%-z;8E=Kd6bN%w0=3k$26V}Anv4W(OiRbrGVB*hL zVXbiX&b>xFyOP$4(UoT=Mh#Y$_M8QCC0HXjyfHC)LC8DSZei`kZ4;wK*fU`BzJC|9 z4D9}y#paH6;Z*{@gU78|+!u`lrQmx`S&Iu>j4?gtB}MEEJbAKp`23Gk2F_dR-z0gd zXf4(^R_#6K;07+*CgP{ZC*oW^H~lR5d%?SI2g!!;FroV*?iYXq*{0vmbFD-zCIYT{PZx1p}CIX6*Hcx(jr>xVfTi}Jwwf2fJ8V^Y^Kjmt*k z5!lZBGcSzRsbg!A$7XHCxc(XA0=0lj;{sm-es)~I9I!gB)!K1E_LO#9z-4Gh8W;AJ zQQ)D|)GzcKo8mR+O#3h{JDzzibPPgAfH|we%b{00DP$S2 z4(o-HI2$`UT8QUX(4vp&$B&K{HR@wJiy?75i|b9?H*>e{CFAHbMD2zIvm51Sk9ni# zk2Q^baU|Xd8%5|V-@?UifsMjOF1A@Tjy741qld>v;suJVdCVxuwpxs%TOha9v$k7N zb`EQr*8LLnEnu-4^2CYJ6s&bU);^WBjaiGoK5pDQ?~dZ15;o)Atr^!(an{^v!iF*H z%0kRVKKSU{)|HRxA7#~9mDx0ZYYjD-V=eY!;bdxzs~4w8$9$eEAet|5hHYZJfqR3S zKDU$mqdclVX)UI?)HOHt)?wUC31F$?Xp z@_f?Xz-`8!P^T+>_sdt`6P`J#llFuhw8Lxm1j1=adx8;VZJIrSaP3w01S(Ja750Ru z&^Ou>4uNX-gx|$}P%P+vz5U>LUV`=k0b|I=ST?9*QT707f5UtFyV?hiIoYqZ4;*x6 z&DjI27+3Nh5SwiIbnpMY%KK0Bvv~ip_UZk{yBJM_-yqsjCh9)m+gs5dFXksM2`p=& zd4%T#W893jj`uIR9$Ek4xfkDd=m#?JPKkDIf_%>7Hqd|N+n2)5Iw3E?{Y&(;j(;LtTvHz75zP7NK5FTT`zNd$9@f z!=<{?#VC8rHWYdJ75LM_l1N?$3mehzv?jhC{$g7J%F@LVs{oo08LhuG)IGa#S6PAZ z$C33Ne;dw1-LsPc!vg4&Txm0^!}>)T(|xh2S2P!{`Hw*wH`}=seOiVrpid;v+0xXz zM8p_T#_&y;gr2r7zaIMe#_K(TP}R* zjzAs0PseUbYb=Sfp`sRDZ0Nr-Z!GCyL&a?@R$wf7uAqg*c=#zLIYrMFWEFphN4q+a z8`a&JchPt!p7F_%dOf~vub-hDtJ7|w-*Ba>^x+O@U(YUP|wZcvfaxsr57wT6{T=Qa3s%~r6R*o~}~ z&Bv)t_8hck4l@4=x;Nm&+JciKi!xs2Ah65l><%&ccrFxlRWP?R>}27=0FKClE;Ez* z<_?7!9e&X%oj<_(J6XSn^~i=CUbheDZX4BUq*Lb#0Fzp9jb599Ycp_d2CmJ(wHdfJ z1J`EY_k0F)=-WPlz7Rqy5K%7yM{rF-pM`K~RHBHz{1T`ygYkS^Wbi5$a? 
zFJs?ExgK<1p=38e{Lg1*{3}4|4iDy}d%CAns^5p@i;9*@n7w&#jJ;WPnnp`!M1erN0#A7#atm??asXKoOFAiNnA-l4_*XxY3-n2t;l~DJhft4FE-KH(dWI>I z?%b%HY*ByGQqrXRw@K8ad`ii1k0l+bNMqnofd_v9a;i&xSNo>}<`ir|Jxb|NB0Wly z96-21#$8^YXG3{;`4tfsH>(%$SKM-!IQsl<@|6*rQKI_Is=q8-mH!@6JEbjD_yiWq zr_JoPNK-xE+)mvepBdbg`{ZN<7;*D9XX`}_J3vaq4t9|)J<N=P1WxePkJ{pCuiLzxdFoS~kwP#rPLDQ_^B2fVIw3QyQ>^7CcQzlGL5`&K^V7CuAsSWRwES7@+rW7Bq< z)zP}`_U)~9n`2Y6rD0=bl5o4d{`PJ5%d&odS-29~&B!`n> znU#6^-HP(us%1U|z08sS78YbV1@mC}vrR?)enj>l_=zHjoC8j;&)Mno6crVji;C)t zXk3UEayi`ou%l~jWl`yX-|JpgRO($@xrPsfgKPMYdnitjq6V4CJ=l*0CsTw> znP^93rl7~?B$2`ed|_si0-;P28}>v|AO%vHf_~`)$>iyg!?z?;&OWaT433t_6m(?3 zOs=p$7y^(BY%#P?4977PY#-dWrn`LGL}Eo>i1eNd1;U+zLs9{{&3Gb zi^)5LZ=}TM=p~QtvcIO6yiNEhivOWU{2`_2ES5dAp;??M$tf;~l3s}vBR^Bd+3aDN zmHwu%pUymygpUAI|CWO4;JyAeXb&*8MEHB4XnspGQQUjVC#=IC#d@Jkw; z1^KH<`1?utY!d!)68=dNz5q;fNy&icsRPjBYyeu%_aRd}?NVS`zdqnBHlOjp>UyR4 z*eYOcysZUT8}E7>ur{8y6IdIsdJnLOhfVWH_I3jcSmtVc>j*Ht541ig{`H%r&tHxY z{x&eJcY5V0-u4WzHeUA<@O_X|GH?c_!w(O2{Tn#c0RFxz$9&|_4-+NP{NJL%q;{7E zll=E5X+O3uHSUz|59E(6Hqd4hg|V8N9geO@zsun;DOJ>u3_^IrJ%0E0aM0V|9kK+2 z&cV>!GKza8*=1W~(o~7qRePgj(>BYkc8As8(6q(j&=%RZ*iax_pJO(}0!|9jMYOI` zK8r)tR>T!<#a%*3x=~Ox=yD+h>-DW$w>g@dwzo=c*$Bz)I2xL_S}o0vt(!J&x3@Z4 zE!Ji`b?0*Q?%cRrv_nzOYW%VsMvOxIuZi$qA3^pz&%NNaiSmDzSkQ$PD4;Lu^N zm7hGnK@2#Ie0(7HZ~oTNwn(30Y!7q(U4&{``GiYfQ2OeU9-Yrn-iJ)v0NEft+Q;dW zOl71e;typyw^5VIehvk6wnKX4Cs016U=Yb3*<7jcq$i*PWjepJf~dYshrM# zDDMGJn7+BH-g@xV2Ri?t{aoGC0j0z%^kz_1Pdz7?RAfYGtdwlXsvdn0C^t%nT9TOY zGGUuFdXitG?B$AhRyI|;-*TP zFWD&z0qv5XRf0@iptbc1vew>(GR6ip I1**OO1x>QY9{>OV diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 4675c5fcad307..d3c47e0473bcd 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -33,6 +33,15 @@ function show_pwd() } +function check_hexagon_sdk() +{ + if [ ! -d ${HEXAGON_SDK_PATH} ]; then + echo -e "HEXAGON_SDK_PATH ${HEXAGON_SDK_PATH} not exist, pls install it accordingly...\n" + exit 0 + fi +} + + function check_and_download_qnn_sdk() { is_qnn_sdk_exist=1 @@ -98,6 +107,16 @@ function check_and_download_ndk() } +function build_dsp +{ + cd ggml/src/ggml-qnn/kernels/ + show_pwd + make clean + make + cd - +} + + function build_arm64 { cmake -H. 
-B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} @@ -106,6 +125,8 @@ function build_arm64 show_pwd cd - + + build_dsp } @@ -158,6 +179,7 @@ function build_ggml_qnn() show_pwd check_and_download_ndk check_and_download_qnn_sdk + check_hexagon_sdk dump_vars remove_temp_dir build_arm64 @@ -314,6 +336,7 @@ show_pwd check_and_download_ndk check_and_download_qnn_sdk +check_hexagon_sdk if [ $# == 0 ]; then show_usage From 5024133579a0d9a1ff20cb30bc0e76fe97a7b823 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 25 Mar 2025 09:30:19 +0800 Subject: [PATCH 133/200] ggml-qnn: remove redundant functions in this PR and make codes more clear --- ggml/include/ggml-qnn.h | 4 +- ggml/src/ggml-qnn/CMakeLists.txt | 2 - ggml/src/ggml-qnn/ggml-qnn.cpp | 391 +++++++------------------------ 3 files changed, 82 insertions(+), 315 deletions(-) diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-qnn.h index 2ff2bef9dcf7d..63d136c5e52b9 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-qnn.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024 The ggml authors + * Copyright (c) 2023-2025 The ggml authors * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -29,7 +29,7 @@ extern "C" { #endif #define GGML_QNN_MAX_DEVICES 3 -#define GGML_QNN_BACKEND_NAME "qnn" +#define GGML_QNN_BACKEND_NAME "hexagon" enum QNNBackend { QNN_BACKEND_CPU, diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index d5a16ffd4e1a1..4fb3a8b6d4b47 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -31,8 +31,6 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/incs) include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc) include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_Debug_aarch64) - include_directories(${HEXAGON_SDK_PATH}/incs/qnx) - include_directories(${HEXAGON_SDK_PATH}/libs/common/qnx/ship/android_Debug_aarch64) include_directories(${HEXAGON_SDK_PATH}/utils/examples) include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/android_aarch64) include_directories(${HEXAGON_SDK_PATH}/libs/atomic/inc) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 847191b7dbc0c..fd1ee54a8cf28 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -1,16 +1,11 @@ /* - * Copyright (c) 2023-2024 The ggml authors + * Copyright (c) 2023-2025 The ggml authors * * Qualcomm QNN SDK and reference tech guides could be found at: * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools * - * there are three tech approaches to implement the ggml-hexagon backend for Qualcomm's Hexagon NPU: - * - general approach through Qualcomm QNN SDK:offload ggml op to QNN, then QNN will transfer to Hexagon cDSP - * - general approach through Qualcomm Hexagon SDK:offload ggml op to Hexagon cDSP directly - * - special approach through Qualcomm QNN SDK:mapping the entire ggml cgraph to a single QNN graph - * - * this single-source-file or self-contained implementation of ggml-hexagon backend has 10 sections: + * this single-source-file or 
self-contained implementation of ggml-hexagon backend has 9 sections: * section-1 forward/prototype declaration, global vars, macros, data structures * section-2 ggml-qnn internal troubleshooting function/class * section-3 helper function for WoA(Windows on ARM) @@ -20,7 +15,6 @@ * section-7 backend helper function / class * section-8 implementation of ggml-hexagon backend according to specification in ggml backend subsystem * section-9 implementation of general approach through QNN and Hexagon DSP - * section-10 implementation of special approach through QNN:mapping the entire ggml cgraph to a single QNN graph * * currently provide following ggml op' implementation through QNN: * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV/GGML_OP_LOG/GGML_OP_SQRT: @@ -136,60 +130,34 @@ class qnn_instance; struct qnn_parameter; struct ggml_backend_qnn_context; -typedef int (*pfn_mallopt)(int, int); -typedef int (*pfn_android_mallopt)(int, void *, size_t); -typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); -typedef int (* notify_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); -typedef int (* ggmlhexagon_op_func_t)(remote_handle64 handle, const dsptensor * src0, const dsptensor * src1, dsptensor * dst); - -static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); -static void ggmlqnn_dump_tensor(const ggml_tensor * tensor, const char * name); -static enum ggml_status ggmlqnn_backend_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph); -static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...); -static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); -static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, - const ggml_tensor * tensor, const char * name, - Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, - uint32_t rank, uint32_t * dims, - void * data, uint32_t data_size, - bool b_transpose = false); - - -//function prototypes for all op functions in the general approach -//general op function for elment-wise operation on 1/2 input tensors and 1 output tensor -static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); - -//todo by AI experts -static void ggmlqnn_compute_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void 
ggmlqnn_compute_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); - -//function prototypes for all op functions in the special approach("mapping the entire cgraph to a single QNN graph") -static void ggmlqnn_graph_addnode(ggml_backend_qnn_context * ctx, struct ggml_cgraph * cgraph, - Qnn_GraphHandle_t graph_handle, std::string & graph_name, ggml_tensor * op, bool is_reuse_graph = false); +static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); +static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); + +static void ggmlqnn_compute_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void 
ggmlqnn_compute_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); +static void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); #if 0//def NDEBUG #define GGMLQNN_DEBUG 0 @@ -283,17 +251,14 @@ using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); -//QNN resource management for the general approach through QNN(similar to ggml-sycl or ggml-cann) +//QNN resource management for the general approach through QNN +using qnn_tensors_t = std::vector< Qnn_Tensor_t >; using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>; using qnn_singlenode_res_t = std::tuple; -//QNN resource management for the special approach through QNN(mapping the entire cgraph to a single QNN graph) -using qnn_tensors_t = std::vector< Qnn_Tensor_t >; -using qnn_tensor_pair_t = std::tuple< ggml_tensor *, Qnn_Tensor_t *>; -using qnn_tensor_pairs_t = std::vector< qnn_tensor_pair_t >; -using qnn_cgraph_node_t = std::tuple; -using qnn_cgraph_nodes_t = std::vector; -using qnn_multinode_res_t = std::tuple; +typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); +typedef int (* notify_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); +typedef int (* ggmlhexagon_op_func_t)(remote_handle64 handle, const dsptensor * src0, const dsptensor * src1, dsptensor * dst); enum qnn_index_type { QNN_TENSOR_INDEX = 0, @@ -306,9 +271,9 @@ enum qnn_profile_level { PROFILE_DETAIL = 2, }; -//0: general approach through QNN -//1: general approach through Hexagon cDSP -//2: special approach through QNN:mapping entire ggml cgraph to QNN graph +//0: general approach through QNN:offload ggmlop to QNN +//1: general approach through Hexagon cDSP:offload ggmlop to Hexagon cDSP directly +//2: special approach through QNN:mapping entire ggml cgraph to a single QNN graph enum inference_approach { QNN_GENERAL = 0, DIRECT_USE_CDSP = 1, @@ -357,7 +322,6 @@ struct qcom_socinfo { struct ggml_backend_qnn_context { int device; - int threads; char name[GGML_MAX_NAME]; char desc[GGML_MAX_NAME]; char lib[GGML_MAX_NAME]; @@ -367,10 +331,8 @@ struct ggml_backend_qnn_context { QNN_SYSTEM_INTERFACE_VER_TYPE raw_system_interface; struct qcom_socinfo socinfo; - //QNN resource management for the general approach through QNN(similar to ggml-sycl or ggml-opencl) + //QNN resource management for the general approach through QNN std::map qnn_singlenode_graph_map; - //QNN resource management for the special approach through QNN(mapping the entire cgraph to a single QNN graph) - std::map qnn_multinode_graph_map; //quantize data -> fp32 std::unique_ptr work_data; @@ -379,7 +341,7 @@ struct ggml_backend_qnn_context { size_t desired_size; int n_threads; - //hexagon resource management for the general approach through Hexagaon cDSP(similar to ggml-sycl or ggml-opencl) + //Hexagon resource management for the general approach through Hexagaon cDSP size_t rpc_mempool_len; void * rpc_mempool; remote_handle64 ggmlop_handle; @@ -516,7 +478,6 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { // HTA - Choose a quantized model. 
Quantized models are required when running on the HTA backend static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { [QNN_BACKEND_CPU] = {.device = 0, - .threads = 1, .name = "qnn-cpu", .desc = "Qualcomm Kryo CPU", #if !defined(__ANDROID__) && !defined(__linux__) @@ -531,7 +492,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, [QNN_BACKEND_GPU] = {.device = 1, - .threads = 1, .name = "qnn-gpu", .desc = "Qualcomm Adreno GPU", #if !defined(__ANDROID__) && !defined(__linux__) @@ -546,7 +506,6 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .socinfo = {}}, [QNN_BACKEND_NPU] = {.device = 2, - .threads = 1, .name = "qnn-npu", .desc = "Qualcomm NPU(Hexagon Tensor Processor)", #if !defined(__ANDROID__) && !defined(__linux__) @@ -996,36 +955,6 @@ static void ggmlqnn_get_timestring(char * p_currenttime) { p_tm->tm_hour, p_tm->tm_min, p_tm->tm_sec); } -//fix some tricky memory issue -static void ggmlqnn_disable_android_tags(int disable) { - if (0 == disable) - return; -#if defined(__ANDROID__) - void * lib_handle = dlopen("libc.so", RTLD_LAZY); - if (nullptr != lib_handle) { - int api_level = android_get_device_api_level(); - GGMLQNN_LOG_INFO("device_api_level=%d", api_level); - if (api_level >= 31) { //ANDROID 12 - pfn_mallopt mallopt = reinterpret_cast(dlsym(lib_handle, "mallopt")); - if (mallopt) { - mallopt(M_BIONIC_SET_HEAP_TAGGING_LEVEL, M_HEAP_TAGGING_LEVEL_NONE); - } - return; - } else if (api_level >= 30) { //ANDROID 11 - /* android_get_device_api_level() < 31 */ - pfn_android_mallopt android_mallopt = reinterpret_cast(dlsym( - lib_handle, "android_mallopt")); - if (android_mallopt) { - int android_malloc_tag_level = 0; - int tmp = 0; - android_mallopt(8, &tmp, sizeof(tmp)); - } - } - dlclose(lib_handle); - } -#endif -} - // ================================================================================================= // section-5: QNN helper function // ================================================================================================= @@ -2876,8 +2805,6 @@ class qnn_instance { void * alloc_rpcmem_internal(size_t bytes, size_t alignment); - void htp_print_info(); - void htp_probe_rpc_meminfo(); void print_backend_info(); @@ -3500,7 +3427,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { Qnn_ErrorHandle_t qnnstatus = QNN_SUCCESS; if (_device_id == QNN_BACKEND_NPU) { - //TODO: remove duplicated code between here and function htp_print_info const QnnDevice_PlatformInfo_t * p_info = nullptr; qcom_socinfo soc_info = {}; qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); @@ -3618,23 +3544,13 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } if (_backend_name.find("Htp") != std::string::npos) { - htp_print_info(); htp_probe_rpc_meminfo(); if (0 != htp_init_perfinfra()) { GGMLQNN_LOG_WARN("initialize HTP performance failure"); } -#if 1 - //FIXME: ht_set_rpc_polling + htp_set_high_performance_mode should be equivalent to htp_enter_performance_mode - if (0 != htp_set_rpc_polling()) { - GGMLQNN_LOG_WARN("set RPC polling failure"); - } - if (0 != htp_set_high_performance_mode()) { - GGMLQNN_LOG_WARN("set HTP high performance mode failure"); - } -#else + htp_enter_performance_mode(); -#endif htp_set_memory_grow_size(); if (enable_qnn_rpc()) { @@ -3875,37 +3791,6 @@ int qnn_instance::htp_init_perfinfra() { return 0; } -void qnn_instance::htp_print_info() { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - 
_qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - GGMLQNN_LOG_INFO("HTP device counts %d", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - for (size_t i = 0; i < p_info->v1.numHwDevices; i++) { - GGMLQNN_LOG_INFO("HTP deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, - infos[i].v1.deviceType, infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; - QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; - GGMLQNN_LOG_INFO("HTP_TYPE:%d(%s)", devinfo->devType, - (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); - GGMLQNN_LOG_INFO("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB," \ - "dlbc_support:%d, signedpd_support:%d", \ - chipinfo.socModel, ggmlqnn_get_socmodel_desc(chipinfo.socModel), \ - htp_arch, ggmlqnn_get_htparch_desc(htp_arch), chipinfo.vtcmSize, \ - chipinfo.dlbcSupport, chipinfo.signedPdSupport); - struct qcom_socinfo * socinfo = ggmlqnn_get_socinfo_from_socmodel(chipinfo.socModel); - g_qnn_mgr[QNN_BACKEND_NPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; - if (nullptr != socinfo) { - memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); - GGMLQNN_LOG_INFO("soc info:%s", socinfo->soc_desc); - } else { - memcpy(g_qnn_mgr[QNN_BACKEND_NPU].socinfo.soc_desc, "unknown", 7); - GGMLQNN_LOG_INFO("soc info:unknown"); - } - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); -} - void qnn_instance::htp_probe_rpc_meminfo() { size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; @@ -3998,58 +3883,6 @@ void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { } } -int qnn_instance::htp_set_rpc_polling() { - if (_qnn_rpc_pollingtime > 0) { - QnnHtpPerfInfrastructure_PowerConfig_t rpc_pollingtime; - memset(&rpc_pollingtime, 0, sizeof(rpc_pollingtime)); - rpc_pollingtime.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME; - rpc_pollingtime.rpcPollingTimeConfig = _qnn_rpc_pollingtime; - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&rpc_pollingtime, nullptr}; - if (_qnn_htp_perfinfra) { - _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); - } - } - return 0; -} - -int qnn_instance::htp_set_high_performance_mode() { - if (nullptr == _qnn_htp_perfinfra) { - GGMLQNN_LOG_DEBUG("perf intra is null\n"); - return 1; - } - - QnnHtpPerfInfrastructure_PowerConfig_t power_config; - memset(&power_config, 0, sizeof(power_config)); - power_config.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3; - power_config.dcvsV3Config.dcvsEnable = 0; - power_config.dcvsV3Config.setDcvsEnable = 1; - power_config.dcvsV3Config.contextId = _qnn_htp_powerconfig_id; - power_config.dcvsV3Config.powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE; - power_config.dcvsV3Config.setSleepLatency = 1; // True to consider Latency parameter otherwise False - power_config.dcvsV3Config.setBusParams = 1; // True to consider Bus parameter otherwise False - power_config.dcvsV3Config.setCoreParams = 1; // True to consider Core parameter otherwise False - power_config.dcvsV3Config.sleepDisable = 0; // True to consider sleep/LPM modes, False to enable - power_config.dcvsV3Config.setSleepDisable = 0; // True to consider sleep disable/enable parameter otherwise False - // set Sleep latency parameter - uint32_t 
latencyValue = 40; - power_config.dcvsV3Config.sleepLatency = latencyValue; // range 40-2000 micro sec - // set Bus Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - power_config.dcvsV3Config.busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set Core Clock Parameters (refer QnnHtpPerfInfrastructure_VoltageCorner_t enum) - power_config.dcvsV3Config.coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - power_config.dcvsV3Config.coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER; - // set power config with different performance parameters - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = {&power_config, nullptr}; - - _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); - - return 0; -} - -//TODO: merge code between this function and htp_set_rpc_polling,htp_set_high_performance_mode void qnn_instance::htp_enter_performance_mode() { QnnHtpPerfInfrastructure_PowerConfig_t dcvs_v3_config = { .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, @@ -4194,6 +4027,13 @@ static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * p return opcfg; } +static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false); static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, @@ -4325,8 +4165,6 @@ static void ggmlqnn_load_cfg() { memset(time_string, 0, GGML_QNN_TMPBUF_LEN); ggmlqnn_get_timestring(time_string); GGMLQNN_LOG_DEBUG("program running start time:%s", time_string); - ggmlqnn_disable_android_tags(0); - std::string cfg_filename = std::string(g_qnn_params.qnn_runtimelib_path) + std::string(g_qnn_params.qnn_cfgfilename); GGMLQNN_LOG_INFO("load ggml-qnn config from %s", cfg_filename.c_str()); qnn_cfg qnncfg_instance; @@ -4393,30 +4231,46 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons if (nullptr != src0) { src0_rank = ggml_n_dims(src0); } else { - GGMLQNN_LOG_DEBUG("op name %s\n", ggml_op_name(op_tensor->op)); + //GGMLQNN_LOG_DEBUG("op name %s\n", ggml_op_name(op_tensor->op)); } if (nullptr != src1) { src1_rank = ggml_n_dims(src1); } else { - GGMLQNN_LOG_DEBUG("op name %s\n", ggml_op_name(op_tensor->op)); + //GGMLQNN_LOG_DEBUG("op name %s\n", ggml_op_name(op_tensor->op)); } - //FIXME: mulmat on cDSP doesn't work as expected - bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); + //TODO: remove this filter in the future, mulmat on cDSP doesn't work as expected + //bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); + bool support = (op_tensor->op == GGML_OP_ADD); if (!support) return false; - ggmlqnn_dump_op_info(op_tensor); - if (!ggml_are_same_shape(src0, src1)) { - return false; - } + switch (op_tensor->op) { + case GGML_OP_ADD: + { + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + return 
ggmlqnn_same_types(ctx, op_tensor); + } - support = ggmlqnn_same_types(ctx, op_tensor); - if (!support) { - return false; - } + case GGML_OP_MUL_MAT: + { + ggmlqnn_dump_op_info(op_tensor); + if (src0_rank != src1_rank) + return false; - return (src0_rank <= 2); + //TODO: remove this filter in the future + if (src0_rank != 2) + return false; + + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) + && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + + } + default: + return false; + } } static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { @@ -4425,13 +4279,7 @@ static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const st } if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { - //return ggmlhexagon_can_handle_op(ctx, op_tensor); - //bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); - //FIXME: mulmat on cDSP doesn't work as expected - bool support = (op_tensor->op == GGML_OP_ADD); - if (!support) - return false; - + return ggmlhexagon_can_handle_op(ctx, op_tensor); } if (!ggmlqnn_k_op_caps[ggmlqnn_get_op_index(op_tensor)].supported) { @@ -4640,7 +4488,6 @@ static bool ggmlqnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * return true; } -//TODO: refine this data structure struct ggml_backend_qnn_buffer_context { ~ggml_backend_qnn_buffer_context() { if (buffer) { @@ -4814,20 +4661,6 @@ static void ggml_backend_qnn_free(ggml_backend_t backend) { } ctx->qnn_singlenode_graph_map.clear(); - std::map::iterator multinode_graph_it; - for (multinode_graph_it = ctx->qnn_multinode_graph_map.begin(); - multinode_graph_it != ctx->qnn_multinode_graph_map.end(); multinode_graph_it++) { - auto & graph_res = multinode_graph_it->second; - Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_res); - qnn_ptensors_t & ptensors = std::get<2>(graph_res); - for (auto tensor_it = ptensors.begin(); tensor_it != ptensors.end(); ++tensor_it) { - free_qnn_tensor(*tensor_it); - } - GGML_UNUSED(graph_handle); - GGMLQNN_LOG_DEBUG("clean up graph:%s", multinode_graph_it->first.c_str()); - } - ctx->qnn_multinode_graph_map.clear(); - instance->qnn_finalize(); delete instance; g_qnn_mgr[ctx->device].instance = nullptr; @@ -5071,7 +4904,7 @@ void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(ggml_backend_is_qnn(backend)); struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; - ctx->threads = n_threads; + ctx->n_threads = n_threads; } int ggml_backend_qnn_get_device_count() { @@ -5260,11 +5093,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { if (nullptr == instance) return nullptr; - if (QNN_SINGLEGRAPH == g_qnn_params.inference_approach) { - ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_special; - } else { - ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_general; - } + ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_general; ggml_backend_t qnn_backend = new ggml_backend{ /* .guid = */ ggml_backend_qnn_guid(), @@ -6003,63 +5832,3 @@ static void ggmlqnn_compute_rope(ggml_backend_qnn_context * ctx, ggml_tensor * d GGML_UNUSED(ctx); GGML_UNUSED(dst); } - -// ================================================================================================= -// section-10: special approach: mapping ggml computational cgraph to QNN graph -// 
================================================================================================= -// TODO: remove duplicated codes between section-9 and section-10 -// TODO: the graph algorithm in this section is naive, should optimized by AI experts -// details: https://github.com/ggml-org/llama.cpp/pull/12326#issuecomment-2712838649 -// ref: https://github.com/kantv-ai/kantv/blob/kantv-poc-with-qnn/core/ggml/jni/Inception_v3.cpp#L20634 -static enum ggml_status ggmlqnn_backend_graph_compute_special(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - enum ggml_status ggml_result = GGML_STATUS_SUCCESS; - Qnn_ErrorHandle_t qnn_error = QNN_SUCCESS; - qnn_perf op_perf = qnn_perf("ggmlqnn_backend_graph_compute_special"); - qnn_instance * instance = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - op_perf.start(); - - //now we got the entire ggml cgraph or a ggml cgraph which contains multiple nodes - GGMLQNN_LOG_DEBUG("qnn device %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device)); - GGMLQNN_LOG_DEBUG("cgraph->n_nodes %d", cgraph->n_nodes); - int num_nodes = std::min(5, cgraph->n_nodes); - //for (int i = 0; i < cgraph->n_nodes; i++) { - for (int i = 0; i < num_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - GGMLQNN_LOG_DEBUG("%s: op %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); - } - - //now we'll offload the ggml cgraph to a single QNN graph - std::string graph_name; - ggmlqnn_get_graphkey_from_cgraph(cgraph, graph_name); - if (graph_name == "") - return GGML_STATUS_SUCCESS; - if (ctx->qnn_multinode_graph_map.find(graph_name) != ctx->qnn_multinode_graph_map.end()) { - GGMLQNN_LOG_DEBUG("graph name %s already create", graph_name.c_str()); - //retrieve computational resource from cached QNN graph - qnn_multinode_res_t &graph_res = ctx->qnn_multinode_graph_map[graph_name]; - graph_handle = std::get<0>(graph_res); - } else { - //create QNN graph - GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); - qnn_error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), 8, 4); - if (QNN_SUCCESS != qnn_error) { - GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d(%s)\n", graph_name.c_str(), qnn_error, - ggmlqnn_get_qnnerror_string(qnn_error)); - return ggml_result; - } - graph_handle = instance->get_qnn_graph_handle(); - //TBD: compose a single QNN graph - - //finalize QNN graph - CHECK_QNN_API(qnn_error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - - //TBD: cache QNN graph - } - //TBD: exec QNN graph - - return ggml_result; -} From 244deb9971a2b182ccd6f12ea47f753c3f031bc6 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 25 Mar 2025 22:29:28 +0800 Subject: [PATCH 134/200] ggml-qnn: original ggml_compute_forward_add and ggml_compute_forward_mul_mat works fine on Hexagon cDSP at the first time --- ggml/src/ggml-qnn/ggml-qnn.cpp | 34 +-- ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c | 64 ++--- ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h | 9 +- ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c | 279 +++++++++++++------ ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c | 70 ++--- 5 files changed, 268 insertions(+), 188 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index fd1ee54a8cf28..909650124a9eb 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -179,8 +179,6 @@ static 
void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_ten #endif #define GGMLQNN_DUMP_TENSOR(tensor) ggmlqnn_dump_tensor(tensor, #tensor) -#define GGMLQNN_MEM_ADD(alignment) (sizeof (size_t) + alignment) -#define GGMLQNN_MEM_MASK(alignment) ((uintptr_t)alignment - 1) #define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 @@ -4230,18 +4228,13 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons uint32_t src1_rank = 0; if (nullptr != src0) { src0_rank = ggml_n_dims(src0); - } else { - //GGMLQNN_LOG_DEBUG("op name %s\n", ggml_op_name(op_tensor->op)); } if (nullptr != src1) { src1_rank = ggml_n_dims(src1); - } else { - //GGMLQNN_LOG_DEBUG("op name %s\n", ggml_op_name(op_tensor->op)); } - //TODO: remove this filter in the future, mulmat on cDSP doesn't work as expected - //bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); - bool support = (op_tensor->op == GGML_OP_ADD); + //TODO: only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP directly + bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); if (!support) return false; @@ -4251,21 +4244,17 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons if (!ggml_are_same_shape(src0, src1)) { return false; } - return ggmlqnn_same_types(ctx, op_tensor); + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } case GGML_OP_MUL_MAT: { ggmlqnn_dump_op_info(op_tensor); - if (src0_rank != src1_rank) - return false; - //TODO: remove this filter in the future - if (src0_rank != 2) + if (src1_rank != 2) return false; - return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) - && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } default: @@ -5110,6 +5099,8 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { ggml_backend_qnn_free(qnn_backend); return nullptr; } + //ensure test-backend-ops get the correct backend name when inference approach is 1(DIRECT_USE_CDSP) + memcpy(g_qnn_mgr[device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); } GGMLQNN_LOG_INFO("leave %s\n", __func__); @@ -5564,11 +5555,6 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor const enum ggml_type src0_type = src0->type; const uint32_t src0_rank = ggml_n_dims(src0); const uint32_t src1_rank = ggml_n_dims(src1); - GGML_ASSERT(src0_rank == src1_rank); - GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy - if (4 == src0_rank) { - return ggmlqnn_compute_mul_mat_4d(ctx, op); - } ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); @@ -5584,6 +5570,12 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor return; } + GGML_ASSERT(src0_rank == src1_rank); + GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy + if (4 == src0_rank) { + return ggmlqnn_compute_mul_mat_4d(ctx, op); + } + void * wdata = ggmlqnn_type_trait(ctx, op); const size_t desired_size = ctx->desired_size; diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c index 6f2c37e4087cc..82de512150bf8 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c @@ -270,13 +270,13 @@ struct Interface { #define 
__QAIC_SLIM_EXPORT #endif -static const Type types[5]; -static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[3])}; -static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x58,0x4,0x50,0x8,0x4,0x8}}; -static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x20,{{(const uintptr_t)&(types[2]),(const uintptr_t)0x4}}, 8,0x8},{0x8,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x8},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; -static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x58,0x60),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,0,0},{SLIM_IFPTR32(0x58,0x60),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,3,0}}; +static const Type types[4]; +static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[2])}; +static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x30,0x4,0x2c,0x4,0x4,0x4}}; +static const Type types[4] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[3]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; static const Parameter* const parameterArrays[6] = {(&(parameters[3])),(&(parameters[3])),(&(parameters[4])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; -static const Method methods[3] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xb4,0x50,3,3,(&(parameterArrays[0])),0x8,0x8}}; +static const Method methods[3] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0x64,0x2c,3,3,(&(parameterArrays[0])),0x4,0x4}}; static const Method* const methodArrays[4] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[2])}; static const char strings[68] = "mulmat\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0add\0uri\0op\0nb\0ne\0h\0"; static const uint16_t methodStrings[49] = {0,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,48,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,39,52,65,13,65}; @@ -294,20 +294,20 @@ __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_hand __QAIC_STUB_EXPORT int 
__QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { return __QAIC_REMOTE(remote_handle64_close)(h); } -static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { int _nErr = 0; remote_arg* _praROutPostStart = _praROutPost; remote_arg** _ppraROutPostStart = _ppraROutPost; _ppraROutPost = &_praROutPost; _COPY(_rout0, 0, _primROut, 0, 4); - _COPY(_rout1, 0, _primROut, 8, 32); - _COPY(_rout2, 0, _primROut, 40, 32); - _COPY(_rout3, 0, _primROut, 72, 4); - _COPY(_rout4, 0, _primROut, 76, 4); + _COPY(_rout1, 0, _primROut, 4, 16); + _COPY(_rout2, 0, _primROut, 20, 16); + _COPY(_rout3, 0, _primROut, 36, 4); + _COPY(_rout4, 0, _primROut, 40, 4); _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; return _nErr; } -static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -322,7 +322,7 @@ static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNU _ppraROutStart[0] += (_praROut - _praROutStart) +1; return _nErr; } -static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, 
_ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint64_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { +static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -331,38 +331,38 @@ static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_U _ppraIn = &_praIn; _ppraROut = &_praROut; _COPY(_primIn, 0, _in0, 0, 4); - _COPY(_primIn, 8, _in1, 0, 32); - _COPY(_primIn, 40, _in2, 0, 32); - _COPY(_primIn, 72, _in3, 0, 4); - _COPY(_primIn, 76, _in4, 0, 4); - _COPY(_primIn, 80, _in5Len, 0, 4); + _COPY(_primIn, 4, _in1, 0, 16); + _COPY(_primIn, 20, _in2, 0, 16); + _COPY(_primIn, 36, _in3, 0, 4); + _COPY(_primIn, 40, _in4, 0, 4); + _COPY(_primIn, 44, _in5Len, 0, 4); _praIn[0].buf.pv = (void*) _in5[0]; _praIn[0].buf.nLen = (4 * _in5Len[0]); _ppraInStart[0] += (_praIn - _praInStart) + 1; _ppraROutStart[0] += (_praROut - _praROutStart) +0; return _nErr; } -static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { _numIn[0] += 0; _numROut[0] += 1; _numInH[0] += 0; _numROutH[0] += 0; } -static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint64_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { +static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { _numIn[0] += 1; _numROut[0] += 0; _numInH[0] += 0; _numROutH[0] += 0; } -static 
__inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_t _in0[SLIM_IFPTR32(11, 12)], uint64_t _in1[SLIM_IFPTR32(11, 12)], uint64_t _rout2[SLIM_IFPTR32(11, 12)]) { +static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uintptr_t _in0[SLIM_IFPTR32(13, 8)], uintptr_t _in1[SLIM_IFPTR32(13, 8)], uintptr_t _rout2[SLIM_IFPTR32(13, 8)]) { remote_arg* _pra = 0; int _numIn[1] = {0}; int _numROut[1] = {0}; int _numInH[1] = {0}; int _numROutH[1] = {0}; _allocator _al[1] = {{0}}; - uint64_t _primIn[23]= {0}; - uint64_t _primROut[10]= {0}; + uint32_t _primIn[25]= {0}; + uint32_t _primROut[11]= {0}; remote_arg* _praIn = 0; remote_arg* _praROut = 0; remote_arg* _praROutPost = 0; @@ -378,9 +378,9 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_ _numROut[0] = 0; _numInH[0] = 0; _numROutH[0] = 0; - _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[1]), (uint64_t*)&(((uint64_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[18]), (uint32_t*)&(((uint32_t*)_in0)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[20]), (char**)&(((uint64_t*)_in0)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[21]), (uint32_t*)&(((uint32_t*)_in0)[22]))); - _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[1]), (uint64_t*)&(((uint64_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[18]), (uint32_t*)&(((uint32_t*)_in1)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[20]), (char**)&(((uint64_t*)_in1)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[21]), (uint32_t*)&(((uint32_t*)_in1)[22]))); - _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), (uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[11]), (char**)&(((uint64_t*)_in0)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[12]), (uint32_t*)&(((uint32_t*)_in0)[14]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[11]), (char**)&(((uint64_t*)_in1)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[12]), (uint32_t*)&(((uint32_t*)_in1)[14]))); + _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14]))); if(_numIn[0]>=255){ _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of input buffers\n"); return AEE_EUNSUPPORTED; @@ -405,13 +405,13 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_ } if(_praHROut == 0) (_praHROut 
= _praHIn + _numInH[0] + 0); - _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[1]), (uint64_t*)&(((uint64_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[18]), (uint32_t*)&(((uint32_t*)_in0)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[20]), (char**)&(((uint64_t*)_in0)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[21]), (uint32_t*)&(((uint32_t*)_in0)[22])))); - _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 88), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[1]), (uint64_t*)&(((uint64_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[18]), (uint32_t*)&(((uint32_t*)_in1)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[20]), (char**)&(((uint64_t*)_in1)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[21]), (uint32_t*)&(((uint32_t*)_in1)[22])))); - _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 176), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), (uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[11]), (char**)&(((uint64_t*)_in0)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[12]), (uint32_t*)&(((uint32_t*)_in0)[14])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 48), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[11]), (char**)&(((uint64_t*)_in1)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[12]), (uint32_t*)&(((uint32_t*)_in1)[14])))); + _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 96), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14])))); _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); - _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), 
(uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22])))); + _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14])))); _QAIC_CATCH(_nErr) {} _CATCH_FARF(_nErr) { _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _mid, __func__); @@ -421,9 +421,9 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint64_ } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_add)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { uint32_t _mid = 2; - return _stub_method(_handle, _mid, (uint64_t*)src0, (uint64_t*)src1, (uint64_t*)dst); + return _stub_method(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mulmat)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { uint32_t _mid = 3; - return _stub_method(_handle, _mid, (uint64_t*)src0, (uint64_t*)src1, (uint64_t*)dst); + return _stub_method(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h index 1273cb76b1797..8e05d06f1c2ba 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h +++ b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h @@ -6,7 +6,6 @@ #include #include - #ifndef __QAIC_HEADER #define __QAIC_HEADER(ff) ff #endif //__QAIC_HEADER @@ -240,8 +239,8 @@ typedef struct _cstring1_s { typedef struct dsptensor dsptensor; struct dsptensor { int32_t type; - int64_t ne[4]; - int64_t nb[4]; + int32_t ne[4]; + int32_t nb[4]; int32_t op; int32_t flags; void * data; @@ -277,8 +276,8 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_ * @retval, 0 on success, should always succeed */ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; #ifndef ggmlop_URI #define ggmlop_URI "file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" #endif /*ggmlop_URI*/ diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c 
b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c index bdb39fdf1f2a3..85d0ad8c8e29e 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c @@ -25,20 +25,35 @@ #include #include #include + #include "HAP_farf.h" +#include "HAP_compute_res.h" +#include "hexagon_types.h" +#include "AEEStdErr.h" + #include "ggmlop_ap_skel.h" +// ================================================================================================= +// section-1: forward/prototype declaration,global vars,macros,data structures +// ================================================================================================= #define ggml_tensor dsptensor #define GGML_MAX_DIMS 4 + #define GGML_UNUSED(x) (void)(x) + #define UNUSED GGML_UNUSED + #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) + #define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) + #define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) + #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) #define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) + #if UINTPTR_MAX == 0xFFFFFFFF #define GGML_MEM_ALIGN 4 #else @@ -49,9 +64,39 @@ #define static_assert(a, b) do { } while (0) -typedef uint16_t ggml_fp16_t; +typedef double ggml_float; +typedef uint16_t ggml_fp16_t; typedef struct { uint16_t bits; } ggml_bf16_t; -typedef double ggml_float; + +static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...); + +enum ggmlhexagon_log_level { + GGMLHEXAGON_LOG_LEVEL_NONE = 0, + GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, + GGMLHEXAGON_LOG_LEVEL_INFO = 2, + GGMLHEXAGON_LOG_LEVEL_WARN = 3, + GGMLHEXAGON_LOG_LEVEL_ERROR = 4, + GGMLHEXAGON_LOG_LEVEL_CONT = 5, +}; + +#if 0//def NDEBUG +#define GGMLQNN_DEBUG 0 +#else +#define GGMLQNN_DEBUG 1 +#endif + +#define GGMLHEXAGON_LOGBUF_LEN 4096 +#define GGMLHEXAGON_TMPBUF_LEN 256 + +#define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#if GGMLQNN_DEBUG +#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLHEXAGON_LOG_DEBUG(...) 
+#endif +#define GGMLQNN_DUMP_TENSOR(tensor) ggmlhexagon_dump_tensor(tensor, #tensor) #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ @@ -133,6 +178,7 @@ enum ggml_type { GGML_TYPE_COUNT = 39, }; + static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); @@ -179,19 +225,54 @@ static const struct ggml_type_traits type_traits[1] = { }; +// ================================================================================================= +// section-2: ggml-hexagon kernel's internal troubleshooting function +// ================================================================================================= +static void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) { + return; + static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; + va_list args; + va_start(args, format); + int len_prefix = snprintf(s_ggmlhexagon_log_internal_buf, GGMLHEXAGON_LOGBUF_LEN, "[%s, %d]: ", + func, line); + int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, + GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) { + + FARF(ALWAYS, "%s\n", s_ggmlhexagon_log_internal_buf); + } + va_end(args); +} + +static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor) { + GGMLHEXAGON_LOG_DEBUG("ne = %5d x %5d x %5d x %5d , nb = (%5zi, %5zi, %5zi, %5zi)\n", + tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], + tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); +} + +static void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ + GGMLHEXAGON_LOG_DEBUG("enter ggml_abort"); + //abort(); + return; +} + +// ================================================================================================= +// section-3: ggml-hexagon kernel's helper function(tiny ggml-dsp, ported from original ggml) +// ================================================================================================= static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { return &type_traits_cpu[type]; } -static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc) { +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, + size_t bx, const float *GGML_RESTRICT y, size_t by, int nrc) { assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); ggml_float sumf = 0.0; for (int i = 0; i < n; ++i) { - sumf += (ggml_float)(x[i]*y[i]); + sumf += (ggml_float) (x[i] * y[i]); } *s = sumf; } @@ -269,7 +350,6 @@ static bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_ten static bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - return (t0->ne[0] == t1->ne[0]) && (t0->ne[1] == t1->ne[1]) && @@ -317,17 +397,8 @@ static bool ggml_is_contiguous(const struct ggml_tensor * tensor) { return ggml_is_contiguous_0(tensor); } -inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } - -static void ggml_dump_tensor(const ggml_tensor * tensor) { - FARF(HIGH, "ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)\n", - tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], - tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); -} - -static void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ - //abort(); - return; +inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { + for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { @@ -345,40 +416,31 @@ int ggmlop_dsp_close(remote_handle64 handle) { return 0; } +// ================================================================================================= +// section-4: ggml-hexagon kernel function +// ================================================================================================= static void ggml_compute_forward_add_f32( - const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + struct ggml_tensor * src0, + struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + memcpy(dst->ne, src1->ne, 16); + memcpy(dst->nb, src1->nb, 16); + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - const int nr = ggml_nrows(src0); + const int ith = 0; + const int nth = 1; + const int nr = ggml_nrows(src0); GGML_TENSOR_BINARY_OP_LOCALS GGML_ASSERT( nb0 == sizeof(float)); GGML_ASSERT(nb00 == sizeof(float)); - const int dr = nr; - - // row range for this thread - const int ir0 = 0; + const int dr = (nr + nth - 1)/nth; + const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - - ggml_dump_tensor(src0); - ggml_dump_tensor(src1); - -#if 1 //naive algorithm for fp32, can works with llama-cli - float * a = (float*)src0->data; - float * b = (float*)src1->data; - float * c = (float*)dst->data; - //TODO: Hexagon SIMD - for (size_t idx = 0; idx < src0->data_len; idx++) { - *c = *a + *b; - a++; - b++; - c++; - } - return; -#endif - if (nb10 == sizeof(float)) { for (int ir = ir0; ir < ir1; ++ir) { // src1 is broadcastable across src0 and dst in i1, i2, i3 @@ -394,9 +456,12 @@ static void ggml_compute_forward_add_f32( float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); +#else ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif } } } else { @@ -422,11 +487,12 @@ static void ggml_compute_forward_add_f32( } } } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); } int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - FARF(HIGH, "=============== DSP: ggmlop_dsp_add "); + GGMLHEXAGON_LOG_DEBUG("enter ggmlop_dsp_add\n"); switch (src0->type) { case GGML_TYPE_F32: { @@ -442,21 +508,34 @@ int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tenso GGML_ABORT("fatal error"); } } - + GGMLHEXAGON_LOG_DEBUG("leave ggmlop_dsp_add\n"); return 0; } - static void ggml_compute_forward_mul_mat_one_chunk( const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, const enum ggml_type type, - const int64_t num_rows_per_vec_dot, - const int64_t ir0_start, - const int64_t ir0_end, - const int64_t ir1_start, - const int64_t ir1_end) { + const int32_t num_rows_per_vec_dot, + const int32_t ir0_start, + const int32_t ir0_end, + const int32_t ir1_start, + const int32_t ir1_end) { + ggmlhexagon_dump_tensor(src0); + ggmlhexagon_dump_tensor(src1); + ggmlhexagon_dump_tensor(dst); + + dst->ne[0] = src0->ne[1]; + 
dst->ne[1] = src1->ne[1]; + dst->ne[2] = src1->ne[2]; + dst->ne[3] = src1->ne[3]; + + dst->nb[0] = ggml_type_size(src1->type); + dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[2] = dst->nb[1] * dst->ne[1]; + dst->nb[3] = dst->nb[2] * dst->ne[2]; + ggmlhexagon_dump_tensor(dst); GGML_TENSOR_BINARY_OP_LOCALS @@ -466,8 +545,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; // broadcast factors - const int64_t r2 = ne12 / ne02; - const int64_t r3 = ne13 / ne03; + const int32_t r2 = ne12 / ne02; + const int32_t r3 = ne13 / ne03; if (ir0_start >= ir0_end || ir1_start >= ir1_end) { return; @@ -481,8 +560,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( assert(ne13 % ne03 == 0); // block-tiling attempt - const int64_t blck_0 = 16; - const int64_t blck_1 = 16; + const int32_t blck_0 = 16; + const int32_t blck_1 = 16; const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; @@ -490,30 +569,38 @@ static void ggml_compute_forward_mul_mat_one_chunk( // 16 * 2, accounting for mmla kernels float tmp[32]; - for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { - for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { - for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { - const int64_t i13 = (ir1 / (ne12 * ne1)); - const int64_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; - const int64_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + for (int32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int32_t i13 = (ir1 / (ne12 * ne1)); + const int32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); // broadcast src0 into src1 - const int64_t i03 = i13 / r3; - const int64_t i02 = i12 / r2; + const int32_t i03 = i13 / r3; + const int32_t i02 = i12 / r2; - const int64_t i1 = i11; - const int64_t i2 = i12; - const int64_t i3 = i13; + const int32_t i1 = i11; + const int32_t i2 = i12; + const int32_t i3 = i13; const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this const char * src1_col = (const char*)wdata + (src1_cont || src1->type != vec_dot_type ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size : (i11 * nb11 + i12 * nb12 + i13 * nb13)); float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); - for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + //for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot); } @@ -525,13 +612,27 @@ static void ggml_compute_forward_mul_mat_one_chunk( } } -int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggmlhexagon_dump_tensor(src0); + ggmlhexagon_dump_tensor(src1); + ggmlhexagon_dump_tensor(dst); + + dst->ne[0] = src0->ne[1]; + dst->ne[1] = src1->ne[1]; + dst->ne[2] = src1->ne[2]; + dst->ne[3] = src1->ne[3]; + + dst->nb[0] = ggml_type_size(src1->type); + dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[2] = dst->nb[1] * dst->ne[1]; + dst->nb[3] = dst->nb[2] * dst->ne[2]; + ggmlhexagon_dump_tensor(dst); GGML_TENSOR_BINARY_OP_LOCALS enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - int64_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; + int32_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -548,10 +649,10 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te GGML_ASSERT(nb1 <= nb2); GGML_ASSERT(nb2 <= nb3); -#if 1 //naive algorithm for fp32, can pass various case in UT - { - ggml_dump_tensor(src0); - ggml_dump_tensor(src1); +#if 0 //naive algorithm for fp32, can pass various case in UT + { + //ggml_dump_tensor(src0); + //ggml_dump_tensor(src1); float * a = (float*)src0->data; float * b = (float*)src1->data; @@ -574,10 +675,10 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te #endif // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) - const int64_t nr0 = ne0; + const int32_t nr0 = ne0; // This is the size of the rest of the dimensions of the result - const int64_t nr1 = ne1 * ne2 * ne3; + const int32_t nr1 = ne1 * ne2 * ne3; // Now select a reasonable chunk size. int chunk_size = 16; @@ -590,8 +691,8 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te // distribute the work across the inner or outer loop based on which one is larger // The number of chunks in the 0/1 dim. // CEIL(nr0/chunk_size) - int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; - int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; + int32_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; + int32_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915 @@ -603,24 +704,24 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te } // The number of elements in each chunk - const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; - const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + const int32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const int32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; // The first chunk comes from our thread_id, the rest will get auto-assigned. 
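    // Worked example (illustrative note, not part of the original patch): with
    // nr0 = 40, nr1 = 30 and chunk_size = 16, nchunk0 = CEIL(40/16) = 3 and
    // nchunk1 = CEIL(30/16) = 2, so dr0 = CEIL(40/3) = 14 rows and dr1 = CEIL(30/2) = 15
    // rows per chunk. A linear chunk id c maps to the tile (ith0 = c % nchunk0,
    // ith1 = c / nchunk0); e.g. c = 4 gives ith0 = 1, ith1 = 1 and covers rows
    // [14, 28) along dim0 and [15, 30) along dim1, exactly as computed below.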
int current_chunk = 0; while (current_chunk < nchunk0 * nchunk1) { - const int64_t ith0 = current_chunk % nchunk0; - const int64_t ith1 = current_chunk / nchunk0; + const int32_t ith0 = current_chunk % nchunk0; + const int32_t ith1 = current_chunk / nchunk0; - const int64_t ir0_start = dr0 * ith0; - const int64_t ir0_end = MIN(ir0_start + dr0, nr0); + const int32_t ir0_start = dr0 * ith0; + const int32_t ir0_end = MIN(ir0_start + dr0, nr0); - const int64_t ir1_start = dr1 * ith1; - const int64_t ir1_end = MIN(ir1_start + dr1, nr1); + const int32_t ir1_start = dr1 * ith1; + const int32_t ir1_end = MIN(ir1_start + dr1, nr1); // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols - int64_t num_rows_per_vec_dot = vec_dot_num_rows; + int32_t num_rows_per_vec_dot = vec_dot_num_rows; // these checks are needed to avoid crossing dim1 boundaries // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity @@ -635,5 +736,5 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te current_chunk++; } - return 0; + return 0; } diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c index 9d6b64fd6b570..58bf1a846742f 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c @@ -7,7 +7,6 @@ #include #include -#include "version_note.h" #include "ggmlop_ap_skel.h" typedef struct _heap _heap; @@ -270,13 +269,13 @@ struct Interface { #define __QAIC_SLIM_EXPORT #endif -static const Type types[5]; -static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[3])}; -static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x58,0x4,0x50,0x8,0x4,0x8}}; -static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x20,{{(const uintptr_t)&(types[2]),(const uintptr_t)0x4}}, 8,0x8},{0x8,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x8},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; -static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x58,0x60),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,0,0},{SLIM_IFPTR32(0x58,0x60),{{(const uintptr_t)&(structTypes[0]),0}}, 22,0x8,3,0}}; +static const Type types[4]; +static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[2])}; +static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x30,0x4,0x2c,0x4,0x4,0x4}}; +static const Type types[4] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[3]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 
0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; static const Parameter* const parameterArrays[6] = {(&(parameters[3])),(&(parameters[3])),(&(parameters[4])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; -static const Method methods[3] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xb4,0x50,3,3,(&(parameterArrays[0])),0x8,0x8}}; +static const Method methods[3] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0x64,0x2c,3,3,(&(parameterArrays[0])),0x4,0x4}}; static const Method* const methodArrays[4] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[2])}; static const char strings[68] = "mulmat\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0add\0uri\0op\0nb\0ne\0h\0"; static const uint16_t methodStrings[49] = {0,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,48,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,39,52,65,13,65}; @@ -289,20 +288,20 @@ extern "C" { #endif _ATTRIBUTE_VISIBILITY uint32_t ggmlop_skel_handle_invoke_qaic_version = 10048; _ATTRIBUTE_VISIBILITY char ggmlop_skel_handle_invoke_uri[77+1]="file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; -static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { int _nErr = 0; remote_arg* _praROutPostStart = _praROutPost; remote_arg** _ppraROutPostStart = _ppraROutPost; _ppraROutPost = &_praROutPost; _COPY(_primROut, 0, _rout0, 0, 4); - _COPY(_primROut, 8, _rout1, 0, 32); - _COPY(_primROut, 40, _rout2, 0, 32); - _COPY(_primROut, 72, _rout3, 0, 4); - _COPY(_primROut, 76, _rout4, 0, 4); + _COPY(_primROut, 4, _rout1, 0, 16); + _COPY(_primROut, 20, _rout2, 0, 16); + _COPY(_primROut, 36, _rout3, 0, 4); + _COPY(_primROut, 40, _rout4, 0, 4); _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; return _nErr; } -static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* 
_primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint64_t _rout1[4], _ATTRIBUTE_UNUSED uint64_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -318,7 +317,7 @@ static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_U _QAIC_CATCH(_nErr) {} return _nErr; } -static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint64_t _in1[4], _ATTRIBUTE_UNUSED uint64_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { +static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -327,11 +326,11 @@ static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE _ppraIn = &_praIn; _ppraROut = &_praROut; _COPY(_in0, 0, _primIn, 0, 4); - _COPY(_in1, 0, _primIn, 8, 32); - _COPY(_in2, 0, _primIn, 40, 32); - _COPY(_in3, 0, _primIn, 72, 4); - _COPY(_in4, 0, _primIn, 76, 4); - _COPY(_in5Len, 0, _primIn, 80, 4); + _COPY(_in1, 0, _primIn, 4, 16); + _COPY(_in2, 0, _primIn, 20, 16); + _COPY(_in3, 0, _primIn, 36, 4); + _COPY(_in4, 0, _primIn, 40, 4); + _COPY(_in5Len, 0, _primIn, 44, 4); _QAIC_ASSERT(_nErr, ((_praIn[0].buf.nLen / 4)) >= (size_t)(_in5Len[0])); _in5[0] = _praIn[0].buf.pv; _ppraInStart[0] += (_praIn - _praInStart) + 1; @@ -341,12 +340,12 @@ static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE } 
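/*
 * Layout note (illustrative, derived from the _COPY offsets above and the buffer
 * offsets used in _skel_method below; this comment is not emitted by the QAIC IDL
 * compiler): with ne[]/nb[] narrowed from int64_t to int32_t, each dsptensor now
 * marshals 48 bytes of scalars: type at offset 0, ne[4] at 4, nb[4] at 20, op at 36,
 * flags at 40, and the data length at 44. The two input tensors therefore start at
 * _primIn offsets 0 and 48, the dst entry starts at offset 96 (hence the ">= 100"
 * input-buffer assertion), and the 44-byte _primROut carries the dst fields written
 * back by the DSP kernel.
 */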
static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, const dsptensor*, dsptensor*), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { remote_arg* _praEnd = 0; - uint64_t _in0[SLIM_IFPTR32(11, 12)] = {0}; - uint64_t _in1[SLIM_IFPTR32(11, 12)] = {0}; - uint64_t _rout2[SLIM_IFPTR32(11, 12)] = {0}; - uint64_t* _primIn= 0; + uintptr_t _in0[SLIM_IFPTR32(13, 8)] = {0}; + uintptr_t _in1[SLIM_IFPTR32(13, 8)] = {0}; + uintptr_t _rout2[SLIM_IFPTR32(13, 8)] = {0}; + uint32_t* _primIn= 0; int _numIn[1] = {0}; - uint64_t* _primROut= 0; + uint32_t* _primROut= 0; int _numInH[1] = {0}; int _numROut[1] = {0}; remote_arg* _praIn = 0; @@ -368,9 +367,9 @@ static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); _QAIC_ASSERT(_nErr, (_pra + ((1 + 1) + (((0 + 0) + 0) + 0))) <= _praEnd); _numIn[0] = (REMOTE_SCALARS_INBUFS(_sc) - 1); - _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 180); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 100); _primIn = _pra[0].buf.pv; - _QAIC_ASSERT(_nErr, _pra[(_numIn[0] + 1)].buf.nLen >= 80); + _QAIC_ASSERT(_nErr, _pra[(_numIn[0] + 1)].buf.nLen >= 44); _primROut = _pra[(_numIn[0] + 1)].buf.pv; _numInH[0] = REMOTE_SCALARS_INHANDLES(_sc); _numROut[0] = REMOTE_SCALARS_OUTBUFS(_sc); @@ -384,11 +383,11 @@ static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, } if(_praHROut == 0) (_praHROut = _praHIn + _numInH[0] + 0); - _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint64_t*)&(((uint64_t*)_in0)[1]), (uint64_t*)&(((uint64_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[18]), (uint32_t*)&(((uint32_t*)_in0)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[20]), (char**)&(((uint64_t*)_in0)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[21]), (uint32_t*)&(((uint32_t*)_in0)[22])))); - _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 88), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint64_t*)&(((uint64_t*)_in1)[1]), (uint64_t*)&(((uint64_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[18]), (uint32_t*)&(((uint32_t*)_in1)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[20]), (char**)&(((uint64_t*)_in1)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[21]), (uint32_t*)&(((uint32_t*)_in1)[22])))); - _TRY(_nErr, _skel_unpack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 176), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), (uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22])))); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[11]), (char**)&(((uint64_t*)_in0)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[12]), (uint32_t*)&(((uint32_t*)_in0)[14])))); + _TRY(_nErr, 
_skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 48), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[11]), (char**)&(((uint64_t*)_in1)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[12]), (uint32_t*)&(((uint32_t*)_in1)[14])))); + _TRY(_nErr, _skel_unpack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 96), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14])))); _TRY(_nErr, _pfn(_h, (const dsptensor*)_in0, (const dsptensor*)_in1, (dsptensor*)_rout2)); - _TRY(_nErr, _skel_pack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint64_t*)&(((uint64_t*)_rout2)[1]), (uint64_t*)&(((uint64_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[18]), (uint32_t*)&(((uint32_t*)_rout2)[19]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[20]), (char**)&(((uint64_t*)_rout2)[10])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[21]), (uint32_t*)&(((uint32_t*)_rout2)[22])))); + _TRY(_nErr, _skel_pack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14])))); _QAIC_CATCH(_nErr) {} _allocator_deinit(_al); return _nErr; @@ -583,14 +582,3 @@ __QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmlop_skel_handle_invoke)(remote_handle64 _h } return AEE_EUNSUPPORTED; } - -/* Library version needs to be added in the name member of note_type structure in below format - * "lib.ver.1.0.0." 
+ "" + ":" + "" - */ -const lib_ver_note_t so_ver __attribute__ ((section (".note.lib.ver"))) - __attribute__ ((visibility ("default"))) = { - 100, - 0, - 0, - "lib.ver.1.0.0.libggmlop_skel.so:4.5.0", - }; From 639605b2f57d22b17046854fdb3b76e5c85c81a4 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 25 Mar 2025 23:08:39 +0800 Subject: [PATCH 135/200] ggml-qnn: modify build-run-android.sh to verify mulmat and validate mulmat performance on cDSP easily --- scripts/build-run-android.sh | 46 ++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index d3c47e0473bcd..2e4143b5acca3 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -236,6 +236,38 @@ function run_test-ops() } +function run_test-op() +{ + prepare_run_on_phone test-backend-ops + + qnnbackendname=qnn-cpu + case $qnnbackend in + 0) + qnnbackendname=qnn-cpu + ;; + 1) + qnnbackendname=qnn-gpu + ;; + 2) + qnnbackendname=qnn-npu + ;; + *) + qnnbackendname=qnn-cpu + ;; + esac + + #debug + echo "adb shell cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname " + + echo "\n" + adb shell "cd ${REMOTE_PATH} \ + && export LD_LIBRARY_PATH=${REMOTE_PATH} \ + && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname " + +} + function print_oplist() { @@ -325,6 +357,7 @@ function show_usage() echo " $0 build" echo " $0 updateqnnlib" echo " $0 run_testops" + echo " $0 run_testop [ADD/MUL_MAT] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]" echo " $0 run_llamacli" echo " $0 run_llamabench" @@ -370,6 +403,19 @@ elif [ $# == 1 ]; then show_usage exit 1 fi +elif [ $# == 3 ]; then + opname=$2 +#TODO: check opname in oplist +#opname can be found via print_oplist: + + qnnbackend=$3 + if [ ${qnnbackend} -gt 3 ]; then + show_usage + exit 1 + fi + + run_test-op + exit 0 else show_usage exit 1 From c2a21d24d6caec6728f320bed43ee983dbcb31d4 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 26 Mar 2025 19:41:58 +0800 Subject: [PATCH 136/200] ggml-qnn: make host code(ggml-qnn.cpp) more clear and more stable --- ggml/src/ggml-qnn/ggml-qnn.cpp | 167 +++++++++++++++++++++++---------- scripts/ggml-qnn.cfg | 12 ++- 2 files changed, 126 insertions(+), 53 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 909650124a9eb..c28d01d134b29 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -23,7 +23,7 @@ * this is a complicated skeleton, can expand other ggml ops accordingly * * currently provide following ggml op' implementation through Hexagon DSP: - * - GGML_OP_ADD: + * - GGML_OP_ADD & GGML_OP_MUL_MAT: * this is a skeleton, can expand other ggml ops accordingly * * Permission is hereby granted, free of charge, to any person obtaining a copy @@ -343,6 +343,7 @@ struct ggml_backend_qnn_context { size_t rpc_mempool_len; void * rpc_mempool; remote_handle64 ggmlop_handle; + int domain_id; }; struct qnn_op_caps { @@ -363,6 +364,8 @@ struct qnn_parameter { int enable_dlbc; int inference_approach; // 0: QNN_GENERAL 1: DIRECT_USE_CDSP 2: QNN_SINGELGRAPH int qnn_backend; // 0: QNN-CPU backend 1: QNN-GPU backend 2: QNN-NPU backend + int enable_mulmat_cdsp; // enable/disable offload mulmat to cDSP + int enable_q_mulmat; // enable/disable offload fp32 & all quantized type mulmat to cDSP const char * qnn_cfgfilename; const char * qnn_runtimelib_path; }; @@ -381,6 +384,8 @@ static struct qnn_parameter 
g_qnn_params = { .enable_dlbc = 1, .inference_approach = 0, .qnn_backend = 2, //default is QNN-NPU backend + .enable_mulmat_cdsp = 0, + .enable_q_mulmat = 0, .qnn_cfgfilename = "ggml-qnn.cfg", #if defined(__ANDROID__) //Android command line program @@ -1451,7 +1456,7 @@ static int ggmlhexagon_get_dsp_support(int * domain) { return hexagon_error; } -static int ggmlhexagon_get_vtcm_info(int domain, uint32_t * capability, uint32_t attr) { +static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capability) { int hexagon_error = AEE_SUCCESS; *capability = 0; @@ -1633,7 +1638,7 @@ static bool ggmlhexagon_is_status_notification_supported(int domain) { return false; } -static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t * capability, uint32_t attr) { +static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t * capability) { int hexagon_error = AEE_SUCCESS; *capability = 0; @@ -1679,7 +1684,7 @@ static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t * capability, u return hexagon_error; } -static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) { +static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) { int hexagon_error = AEE_SUCCESS; *capability = 0; if(remote_handle_control) { @@ -1696,7 +1701,7 @@ static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) { hexagon_error = AEE_SUCCESS; goto bail; } else if (hexagon_error == AEE_SUCCESS) { - *capability = dsp_capability_arch_ver.capability; + *capability = dsp_capability_arch_ver.capability & 0xFF; } else { GGMLQNN_LOG_DEBUG("get_hex_arch_ver failed with error 0x%x", hexagon_error); goto bail; @@ -1710,7 +1715,7 @@ static int ggmlhexagon_get_hex_arch_ver(int domain, uint32_t * capability) { return hexagon_error; } -static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t * capability, uint32_t attr) +static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t * capability) { int hexagon_error = AEE_SUCCESS; *capability = 0; @@ -1834,6 +1839,58 @@ static AEEResult ggmlhexagon_set_clocks(remote_handle64 handle, int32 power_leve return AEE_SUCCESS; } +static void ggmlhexagon_probe_dspinfo(ggml_backend_qnn_context * ctx, size_t * rpcmem_capacity) { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); + if (nullptr == rpc_buffer) { + GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + rpcmem_free(rpc_buffer); + rpc_buffer = nullptr; + } + } + + *rpcmem_capacity = candidate_size; + GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", *rpcmem_capacity); + + uint32_t dsp_version = 0; + ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version); + + if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) { + GGMLQNN_LOG_DEBUG("dsp arch version 0x%x", dsp_version); + } else { + GGMLQNN_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version); + } + + uint32_t vtcm_count = 0; + uint32_t vtcm_page = 0; + ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count); + 
ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE, &vtcm_page); + GGMLQNN_LOG_DEBUG("vtcm_count %d", vtcm_count); + GGMLQNN_LOG_DEBUG("vtcm_page %d", vtcm_page); + + uint32_t hmx_depth = 0; + uint32_t hmx_spatial = 0; + ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH, &hmx_depth); + ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial); + GGMLQNN_LOG_DEBUG("hmx_depth %d", hmx_depth); + GGMLQNN_LOG_DEBUG("hmx_spatial %d", hmx_spatial); + + uint32_t hvx_support_128b = 0; + ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b); + GGMLQNN_LOG_DEBUG("hvx_support_128b %d", hvx_support_128b); + + GGMLQNN_LOG_DEBUG("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); + GGMLQNN_LOG_DEBUG("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); +} + static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { int hexagon_error = AEE_SUCCESS; @@ -1931,6 +1988,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { } } + ctx->domain_id = domain_id; GGMLQNN_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); GGMLQNN_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); if (is_unsignedpd_enabled) { @@ -1966,7 +2024,9 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); if (AEE_SUCCESS == hexagon_error) { GGMLQNN_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); - GGMLQNN_LOG_INFO("only support GGML_OP_ADD on cDSP currently\n"); + GGMLQNN_LOG_INFO("only support GGML_OP_ADD and GGML_OP_MUL_MAT on cDSP currently\n"); + size_t rpcmem_size = 0; + ggmlhexagon_probe_dspinfo(ctx, &rpcmem_size); ggmlhexagon_set_clocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1); ggmlhexagon_set_rpc_latency(domain_id, RPC_POLL_QOS, 1000); } else { @@ -1983,9 +2043,10 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { if (ctx->rpc_mempool) { rpcmem_free(ctx->rpc_mempool); - ctx->rpc_mempool = nullptr; - ctx->rpc_mempool_len = 0; - ctx->ggmlop_handle = -1; + ctx->rpc_mempool = nullptr; + ctx->rpc_mempool_len = 0; + ctx->ggmlop_handle = -1; + ctx->domain_id = -1; } return -1; @@ -2005,8 +2066,9 @@ static void ggmlhexagon_close_cdsp(ggml_backend_qnn_context * ctx) { if (ctx->rpc_mempool) { rpcmem_free(ctx->rpc_mempool); - ctx->rpc_mempool = nullptr; - ctx->rpc_mempool_len = 0; + ctx->rpc_mempool = nullptr; + ctx->rpc_mempool_len = 0; + ctx->domain_id = -1; } GGMLQNN_LOG_DEBUG("leave %s", __func__); } @@ -2019,20 +2081,15 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens int hexagon_error = AEE_SUCCESS; ggmlhexagon_op_func_t op_func = nullptr; - void * wdata = nullptr; - ggml_tensor * src0 = op->src[0]; - //src1 might-be nullptr for some ggml op ggml_tensor * src1 = op->src[1]; ggml_tensor * dst = op; - ggml_type src0_type = src0->type; switch (op->op) { case GGML_OP_ADD: op_func = ggmlop_dsp_add; break; case GGML_OP_MUL_MAT: { - wdata = ggmlqnn_type_trait(ctx, op); op_func = ggmlop_dsp_mulmat; break; } @@ -2040,18 +2097,12 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens return; } - if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) { - dsptensor_0.data = wdata; - dsptensor_0.data_len = ctx->desired_size; - } else { - dsptensor_0.data = src0->data; - dsptensor_0.data_len = ggml_nbytes(src0); - } + 
dsptensor_0.data = src0->data; + dsptensor_0.data_len = ggml_nbytes(src0); - dsptensor_1.data = src1->data; - dsptensor_2.data = dst->data; + dsptensor_1.data = src1->data; + dsptensor_2.data = dst->data; - //make compiler happy dsptensor_0.ne[0] = src0->ne[0]; dsptensor_0.ne[1] = src0->ne[1]; dsptensor_0.ne[2] = src0->ne[2]; @@ -2086,10 +2137,6 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens dsptensor_1.data_len = ggml_nbytes(src1); dsptensor_2.data_len = ggml_nbytes(dst); - if ((GGML_OP_MUL_MAT == op->op) && (src0_type != GGML_TYPE_F32)) { - dsptensor_0.data_len = ctx->desired_size; - } - dsptensor_0.type = src0->type; dsptensor_1.type = src1->type; dsptensor_2.type = dst->type; @@ -4179,10 +4226,12 @@ static void ggmlqnn_load_cfg() { qnncfg_instance.get_intvalue("general", "dump_op_info", g_qnn_params.dump_op_info, 0); qnncfg_instance.get_intvalue("general", "inference_approach", g_qnn_params.inference_approach, 0); qnncfg_instance.get_intvalue("general", "qnn_backend", g_qnn_params.qnn_backend, 2); - qnncfg_instance.get_intvalue("npu", "hvx_threads", g_qnn_params.hvx_threads, 4); - qnncfg_instance.get_intvalue("npu", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8); - qnncfg_instance.get_intvalue("npu", "enable_dlbc", g_qnn_params.enable_dlbc, 0); - qnncfg_instance.get_stringvalue("npu", "precision_mode", precision_mode, "fp32"); + qnncfg_instance.get_intvalue("qnn", "hvx_threads", g_qnn_params.hvx_threads, 4); + qnncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8); + qnncfg_instance.get_intvalue("qnn", "enable_dlbc", g_qnn_params.enable_dlbc, 0); + qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); + qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_qnn_params.enable_mulmat_cdsp, 0); + qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_qnn_params.enable_q_mulmat, 0); GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_qnn_params.print_qnn_internal_log); GGMLQNN_LOG_INFO("inference_approach=%d(%s)", g_qnn_params.inference_approach, ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach)); @@ -4226,6 +4275,8 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons const int64_t ne00 = op_tensor->src[0]->ne[0]; uint32_t src0_rank = 0; uint32_t src1_rank = 0; + bool support = false; + if (nullptr != src0) { src0_rank = ggml_n_dims(src0); } @@ -4233,32 +4284,39 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons src1_rank = ggml_n_dims(src1); } - //TODO: only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP directly - bool support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); + if (g_qnn_params.enable_mulmat_cdsp) + support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); + else + support = (op_tensor->op == GGML_OP_ADD); if (!support) return false; + ggmlqnn_dump_op_info(op_tensor); switch (op_tensor->op) { case GGML_OP_ADD: { if (!ggml_are_same_shape(src0, src1)) { return false; } + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } - case GGML_OP_MUL_MAT: { ggmlqnn_dump_op_info(op_tensor); - if (src1_rank != 2) + //TODO:3d&4d matrix mulmat on cDSP + if (src0_rank != 2) return false; - return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); - + if (g_qnn_params.enable_q_mulmat) + return (src0->type == GGML_TYPE_F32 || 
ggml_is_quantized(src0->type)) + && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + else + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } default: - return false; + return ggmlqnn_same_types(ctx, op_tensor); } } @@ -4597,8 +4655,6 @@ static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( if (nullptr == ctx->buffer) { GGMLQNN_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20)); return nullptr; - } else { - GGMLQNN_LOG_DEBUG("%s: allocate %d MiB\n", __func__, size_aligned / (1 << 20)); } return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); @@ -4729,10 +4785,16 @@ static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * *total = ggmlqnn_get_system_total_memory_in_bytes(); *free = ggmlqnn_get_system_free_memory_in_bytes(); } else if (QNN_BACKEND_NPU == ctx->device) { - size_t rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); - size_t rpc_ion_usage = ctx->instance->get_rpcmem_usage(); - GGMLQNN_LOG_DEBUG("rpc memsize %d", rpc_ion_memsize); - GGMLQNN_LOG_DEBUG("rpc usage %d", rpc_ion_usage); + size_t rpc_ion_memsize = 0; + size_t rpc_ion_usage = 0; + if (DIRECT_USE_CDSP != g_qnn_params.inference_approach) { + rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); + rpc_ion_usage = ctx->instance->get_rpcmem_usage(); + } else { + ggmlhexagon_probe_dspinfo(ctx, &rpc_ion_memsize); + } + GGMLQNN_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize); + GGMLQNN_LOG_DEBUG("rpc usage %d M", rpc_ion_usage); *total = rpc_ion_memsize * (1 << 20); *free = (rpc_ion_memsize - rpc_ion_usage) * (1 << 20); } @@ -5078,9 +5140,12 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return g_qnn_mgr[device].backend; } - qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path); - if (nullptr == instance) - return nullptr; + //don't initialize QNN when inference approach is offload ggml op to Hexagon cDSP directly + if (DIRECT_USE_CDSP != g_qnn_params.inference_approach) { + qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path); + if (nullptr == instance) + return nullptr; + } ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_general; diff --git a/scripts/ggml-qnn.cfg b/scripts/ggml-qnn.cfg index 9d47dba7a596a..513aecfc64862 100644 --- a/scripts/ggml-qnn.cfg +++ b/scripts/ggml-qnn.cfg @@ -1,7 +1,7 @@ [general] #0: QNN-CPU backend #1: QNN-GPU backend -#2: QNN-NPU backend +#2: QNN-NPU backend / Hexagon cDSP #3: default ggml backend qnn_backend = 2 @@ -22,8 +22,16 @@ dump_op_info = 0 # 2: special approach through QNN: mapping entire ggml cgraph to QNN graph inference_approach = 1 -[npu] +#inference approach through QNN +[qnn] hvx_threads = 4 vtcm_size_in_mb = 8 enable_dlbc = 1 precision_mode = "fp16" + +#inference approach through cDSP +[cdsp] +#enable/disable offload mulmat to cDSP +enable_mulmat_cdsp = 0 +#enable/disable offload fp32 & all quantized type mulmat to cDSP +enable_q_mulmat = 0 From 1bb49f334cdd060158003c10dc8f2c673253b9e1 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 26 Mar 2025 22:42:03 +0800 Subject: [PATCH 137/200] ggml-qnn: refine code according to self code-review and make code more clear --- ggml/src/ggml-qnn/ggml-qnn.cpp | 95 +++++++++++++++++----------------- scripts/ggml-qnn.cfg | 12 ++--- 2 files changed, 53 insertions(+), 54 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index 
c28d01d134b29..f882552a7fc1b 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -14,17 +14,17 @@ * section-6 Hexagon DSP helper function * section-7 backend helper function / class * section-8 implementation of ggml-hexagon backend according to specification in ggml backend subsystem - * section-9 implementation of general approach through QNN and Hexagon DSP + * section-9 implementation of hwaccel approach through QNN and Hexagon DSP * * currently provide following ggml op' implementation through QNN: * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV/GGML_OP_LOG/GGML_OP_SQRT: - * this is a simple skeleton, can expand other ggml ops according to expertise + * this is a simple hwaccel skeleton, can expand other ggml ops according to expertise * - GGML_OP_MUL_MAT: - * this is a complicated skeleton, can expand other ggml ops accordingly + * this is a complicated hwaccel skeleton, can expand other ggml ops accordingly * * currently provide following ggml op' implementation through Hexagon DSP: * - GGML_OP_ADD & GGML_OP_MUL_MAT: - * this is a skeleton, can expand other ggml ops accordingly + * this is a hwaccel skeleton, can expand other ggml ops accordingly * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -230,7 +230,7 @@ static void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_ten #define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ - if (g_qnn_params.inference_approach != DIRECT_USE_CDSP) { \ + if (g_qnn_params.hwaccel_approach != HWACCEL_CDSP) { \ if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ return; \ } \ @@ -270,12 +270,12 @@ enum qnn_profile_level { }; //0: general approach through QNN:offload ggmlop to QNN -//1: general approach through Hexagon cDSP:offload ggmlop to Hexagon cDSP directly -//2: special approach through QNN:mapping entire ggml cgraph to a single QNN graph -enum inference_approach { - QNN_GENERAL = 0, - DIRECT_USE_CDSP = 1, - QNN_SINGLEGRAPH = 2, +//1: special approach through QNN-SINGLEGRAPH:mapping entire ggml cgraph to a single QNN graph +//2: general approach through Hexagon cDSP:offload ggmlop to Hexagon cDSP directly +enum hwaccel_approach_type { + HWACCEL_QNN = 0, + HWACCEL_QNN_SINGLEGRAPH = 1, + HWACCEL_CDSP = 2, }; enum hexagon_dsp_type { @@ -362,7 +362,7 @@ struct qnn_parameter { int hvx_threads; int vtcm_size_in_mb; int enable_dlbc; - int inference_approach; // 0: QNN_GENERAL 1: DIRECT_USE_CDSP 2: QNN_SINGELGRAPH + int hwaccel_approach; // 0: HWACCEL_QNN 1: HWACCEL_QNN_SINGLEGRAPH 2: HWACCEL_CDSP int qnn_backend; // 0: QNN-CPU backend 1: QNN-GPU backend 2: QNN-NPU backend int enable_mulmat_cdsp; // enable/disable offload mulmat to cDSP int enable_q_mulmat; // enable/disable offload fp32 & all quantized type mulmat to cDSP @@ -382,8 +382,8 @@ static struct qnn_parameter g_qnn_params = { .hvx_threads = 4, .vtcm_size_in_mb = 8, .enable_dlbc = 1, - .inference_approach = 0, - .qnn_backend = 2, //default is QNN-NPU backend + .hwaccel_approach = HWACCEL_CDSP, + .qnn_backend = QNN_BACKEND_NPU, .enable_mulmat_cdsp = 0, .enable_q_mulmat = 0, .qnn_cfgfilename = "ggml-qnn.cfg", @@ -1578,13 +1578,12 @@ static void ggmlhexagon_set_rpc_latency(int domain, int qos, int latency) { if (remote_handle_control) { struct remote_rpc_control_latency data; -#if 1 - data.enable = RPC_PM_QOS; - data.latency = 300; -#else - data.enable = RPC_POLL_QOS; - data.latency = 1000; -#endif +/* + qos | latency + 
----------------------- + RPC_PM_QOS | 300 + RPC_POLL_QOS | 1000 +*/ data.enable = qos; data.latency = latency; hexagon_error = remote_handle64_control(DSPRPC_GET_DSP_INFO, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); @@ -1926,7 +1925,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { } if (-1 == domain_id) { - if (NULL != domain_type) { + if (nullptr != domain_type) { if ((strcmp(domain_type, "NSP") != 0 && strcmp(domain_type, "HPASS") != 0)) { GGMLQNN_LOG_WARN("invalid domain_type %s. possible values are NSP or HPASS", domain_type); goto bail; @@ -2188,16 +2187,16 @@ static const char * ggmlqnn_get_htparch_desc(size_t htp_arch) { } } -static const char * ggmlqnn_get_inference_approach_name(int inference_approach) { - switch (inference_approach) { - case QNN_GENERAL: - return "QNN_GENERAL"; - case DIRECT_USE_CDSP: - return "DIRECT_USE_CDSP"; - case QNN_SINGLEGRAPH: - return "QNN_SINGLEGRAPH"; +static const char * ggmlqnn_get_hwaccel_approach_name(int hwaccle_approach) { + switch (hwaccle_approach) { + case HWACCEL_QNN: + return "HWACCEL_QNN"; + case HWACCEL_QNN_SINGLEGRAPH: + return "HWACCEL_QNN_SINGLEGRAPH"; + case HWACCEL_CDSP: + return "HWACCEL_CDSP"; default: - return "unknown approach"; + return "unknown hwaccel approach"; } } @@ -3996,7 +3995,7 @@ void qnn_instance::htp_enter_performance_mode() { } static void ggmlqnn_set_runtime_path(size_t device, const std::string & path) { - if ((QNN_BACKEND_NPU == device) || (DIRECT_USE_CDSP == g_qnn_params.inference_approach)) { + if ((QNN_BACKEND_NPU == device) || (HWACCEL_CDSP == g_qnn_params.hwaccel_approach)) { if (0 == setenv("LD_LIBRARY_PATH", (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), @@ -4224,7 +4223,7 @@ static void ggmlqnn_load_cfg() { qnncfg_instance.get_intvalue("general", "enable_perf", g_qnn_params.enable_perf, 0); qnncfg_instance.get_intvalue("general", "print_tensors_info", g_qnn_params.print_tensors_info, 0); qnncfg_instance.get_intvalue("general", "dump_op_info", g_qnn_params.dump_op_info, 0); - qnncfg_instance.get_intvalue("general", "inference_approach", g_qnn_params.inference_approach, 0); + qnncfg_instance.get_intvalue("general", "hwaccel_approach", g_qnn_params.hwaccel_approach, 0); qnncfg_instance.get_intvalue("general", "qnn_backend", g_qnn_params.qnn_backend, 2); qnncfg_instance.get_intvalue("qnn", "hvx_threads", g_qnn_params.hvx_threads, 4); qnncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8); @@ -4233,8 +4232,8 @@ static void ggmlqnn_load_cfg() { qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_qnn_params.enable_mulmat_cdsp, 0); qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_qnn_params.enable_q_mulmat, 0); GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_qnn_params.print_qnn_internal_log); - GGMLQNN_LOG_INFO("inference_approach=%d(%s)", g_qnn_params.inference_approach, - ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach)); + GGMLQNN_LOG_INFO("hwaccel_approach=%d(%s)", g_qnn_params.hwaccel_approach, + ggmlqnn_get_hwaccel_approach_name(g_qnn_params.hwaccel_approach)); GGMLQNN_LOG_INFO("qnn_backend=%d", g_qnn_params.qnn_backend); GGMLQNN_LOG_INFO("npu inference precision mode=%s", precision_mode.c_str()); GGMLQNN_LOG_INFO("qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); @@ -4325,7 +4324,7 @@ static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const st return true; } - if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + if 
(HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { return ggmlhexagon_can_handle_op(ctx, op_tensor); } @@ -4686,7 +4685,7 @@ static void ggml_backend_qnn_free(ggml_backend_t backend) { ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; GGMLQNN_LOG_DEBUG("device idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { ggmlhexagon_close_cdsp(ctx); } @@ -4787,7 +4786,7 @@ static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * } else if (QNN_BACKEND_NPU == ctx->device) { size_t rpc_ion_memsize = 0; size_t rpc_ion_usage = 0; - if (DIRECT_USE_CDSP != g_qnn_params.inference_approach) { + if (HWACCEL_CDSP != g_qnn_params.hwaccel_approach) { rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); rpc_ion_usage = ctx->instance->get_rpcmem_usage(); } else { @@ -5013,8 +5012,8 @@ ggml_backend_reg_t ggml_backend_qnn_reg() { //case-2: normal scenario, such as llama-cli or UI applicaton ggmlqnn_load_cfg(); - GGMLQNN_LOG_INFO("inference approach=%d(%s)", g_qnn_params.inference_approach, - ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach)); + GGMLQNN_LOG_INFO("inference approach=%d(%s)", g_qnn_params.hwaccel_approach, + ggmlqnn_get_hwaccel_approach_name(g_qnn_params.hwaccel_approach)); GGMLQNN_LOG_INFO("user's specified qnn_backend=%d", g_qnn_params.qnn_backend); GGMLQNN_LOG_INFO("user's specified qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); if (g_qnn_params.qnn_backend >= GGML_QNN_MAX_DEVICES) { @@ -5053,7 +5052,7 @@ ggml_backend_reg_t ggml_backend_qnn_reg() { } const char * ggml_backend_qnn_get_devname(size_t dev_num) { - if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { if (dev_num == QNN_BACKEND_GGML) return "ggml"; else @@ -5076,8 +5075,8 @@ const char * ggml_backend_qnn_get_devname(size_t dev_num) { static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) { int result = 0; - GGMLQNN_LOG_INFO("inference approach=%d(%s)", g_qnn_params.inference_approach, - ggmlqnn_get_inference_approach_name(g_qnn_params.inference_approach)); + GGMLQNN_LOG_INFO("inference approach=%d(%s)", g_qnn_params.hwaccel_approach, + ggmlqnn_get_hwaccel_approach_name(g_qnn_params.hwaccel_approach)); qnn_instance * instance = nullptr; instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); @@ -5141,7 +5140,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { } //don't initialize QNN when inference approach is offload ggml op to Hexagon cDSP directly - if (DIRECT_USE_CDSP != g_qnn_params.inference_approach) { + if (HWACCEL_CDSP != g_qnn_params.hwaccel_approach) { qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path); if (nullptr == instance) return nullptr; @@ -5157,14 +5156,14 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { }; g_qnn_mgr[device].backend = qnn_backend; - if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { int result = ggmlhexagon_init_dsp(&g_qnn_mgr[device]); if (0 != result) { GGMLQNN_LOG_INFO("init hexagon dsp failure"); ggml_backend_qnn_free(qnn_backend); return nullptr; } - //ensure test-backend-ops get the correct backend name when inference approach is 1(DIRECT_USE_CDSP) + //ensure test-backend-ops get the correct backend name when inference approach is 
1(HWACCEL_CDSP) memcpy(g_qnn_mgr[device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); } @@ -5237,7 +5236,7 @@ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_ten qnn_perf op_perf = qnn_perf(graph_name); op_perf.start(); - if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { ggmlhexagon_compute(ctx, op); op_perf.info(); return; @@ -5629,7 +5628,7 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor qnn_perf op_perf = qnn_perf(graph_name); op_perf.start(); - if (DIRECT_USE_CDSP == g_qnn_params.inference_approach) { + if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { ggmlhexagon_compute(ctx, op); op_perf.info(); return; diff --git a/scripts/ggml-qnn.cfg b/scripts/ggml-qnn.cfg index 513aecfc64862..17bd8f6a4b1ca 100644 --- a/scripts/ggml-qnn.cfg +++ b/scripts/ggml-qnn.cfg @@ -17,19 +17,19 @@ print_tensors_info = 0 # enable/disable dump op info in handle_op dump_op_info = 0 -# 0: general approach through QNN -# 1: general approach through Hexagon cDSP -# 2: special approach through QNN: mapping entire ggml cgraph to QNN graph -inference_approach = 1 +# 0: hwaccel approach through QNN +# 1: hwaccel approach through QNN-SINGLEGRAPH: mapping entire ggml cgraph to a single QNN graph +# 2: hwaccel approach through Hexagon cDSP +hwaccel_approach = 2 -#inference approach through QNN +#hwaccel approach through QNN [qnn] hvx_threads = 4 vtcm_size_in_mb = 8 enable_dlbc = 1 precision_mode = "fp16" -#inference approach through cDSP +#hwaccel approach through cDSP [cdsp] #enable/disable offload mulmat to cDSP enable_mulmat_cdsp = 0 From e69955ed23f9400e0f09acb044e21bb052f034f7 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 27 Mar 2025 20:19:40 +0800 Subject: [PATCH 138/200] ggml-qnn: offload more ggml op to Hexagon cDSP --- ggml/src/ggml-qnn/ggml-qnn.cpp | 334 +++++++------ ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c | 61 ++- ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h | 14 +- ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c | 477 ++++++++++++++++--- ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c | 61 ++- scripts/ggml-qnn.cfg | 2 +- 6 files changed, 728 insertions(+), 221 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index f882552a7fc1b..dcc332042bdc1 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -130,10 +130,8 @@ class qnn_instance; struct qnn_parameter; struct ggml_backend_qnn_context; -static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); - static void ggmlqnn_compute_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggmlqnn_compute_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggmlqnn_compute_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); @@ -159,6 +157,10 @@ static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_qnn_context * ctx, static void ggmlqnn_compute_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); static void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); +static size_t ggmlqnn_get_op_index(const ggml_tensor * tensor); +static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); +static void ggmlqnn_get_opkey_from_op(const ggml_tensor * op, 
std::string & output); + #if 0//def NDEBUG #define GGMLQNN_DEBUG 0 #else @@ -349,8 +351,16 @@ struct ggml_backend_qnn_context { struct qnn_op_caps { bool supported; ggml_op op; + const size_t input_param_count; const char * qnn_op_name; +}; + +struct hexagon_op_caps { + bool supported; + ggml_op op; const size_t input_param_count; + const char * hexagon_op_name; + ggmlhexagon_op_func_t dsp_op_func; }; struct qnn_parameter { @@ -531,18 +541,19 @@ static domain hexagon_supported_domains[] = { {CDSP1_DOMAIN_ID, CDSP1_DOMAIN} }; +//supported ggml op by HWACCEL_QNN static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { - {true, GGML_OP_NONE, nullptr, 0}, + {true, GGML_OP_NONE, 0}, {false, GGML_OP_DUP}, - {true, GGML_OP_ADD, QNN_OP_ELEMENT_WISE_ADD, 2}, + {true, GGML_OP_ADD, 2, QNN_OP_ELEMENT_WISE_ADD}, {false, GGML_OP_ADD1}, {false, GGML_OP_ACC}, - {true, GGML_OP_SUB, QNN_OP_ELEMENT_WISE_SUBTRACT, 2}, - {true, GGML_OP_MUL, QNN_OP_ELEMENT_WISE_MULTIPLY, 2}, - {true, GGML_OP_DIV, QNN_OP_ELEMENT_WISE_DIVIDE, 2}, + {true, GGML_OP_SUB, 2, QNN_OP_ELEMENT_WISE_SUBTRACT}, + {true, GGML_OP_MUL, 2, QNN_OP_ELEMENT_WISE_MULTIPLY}, + {true, GGML_OP_DIV, 2, QNN_OP_ELEMENT_WISE_DIVIDE}, {false, GGML_OP_SQR}, - {true, GGML_OP_SQRT, QNN_OP_ELEMENT_WISE_SQUARE_ROOT, 1}, - {true, GGML_OP_LOG, QNN_OP_ELEMENT_WISE_LOG, 1}, + {true, GGML_OP_SQRT, 1, QNN_OP_ELEMENT_WISE_SQUARE_ROOT}, + {true, GGML_OP_LOG, 1, QNN_OP_ELEMENT_WISE_LOG}, {false, GGML_OP_SIN}, {false, GGML_OP_COS}, {false, GGML_OP_SUM}, @@ -559,7 +570,7 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_RMS_NORM_BACK}, {false, GGML_OP_GROUP_NORM}, {false, GGML_OP_L2_NORM}, - {true, GGML_OP_MUL_MAT, QNN_OP_MAT_MUL, 2}, + {true, GGML_OP_MUL_MAT, 2, QNN_OP_MAT_MUL}, {false, GGML_OP_MUL_MAT_ID}, {false, GGML_OP_OUT_PROD}, {false, GGML_OP_SCALE}, @@ -638,7 +649,117 @@ static_assert(ggmlqnn_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); static_assert(std::size(ggmlqnn_k_op_caps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), - "pls check ggmlqnn_k_op_caps and ensure is corresponding to latest ggml.h"); + "pls check ggmlqnn_k_op_caps and ensure is corresponding to latest ggml.h"); + +//supported ggml op by HWACCEL_CDSP +static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { + {true, GGML_OP_NONE, 0}, + {false, GGML_OP_DUP}, + {true, GGML_OP_ADD, 2, "ggmlop_dsp_add", ggmlop_dsp_add}, + {false, GGML_OP_ADD1}, + {false, GGML_OP_ACC}, + {true, GGML_OP_SUB, 2, "ggmlop_dsp_sub", ggmlop_dsp_sub}, + {true, GGML_OP_MUL, 2, "ggmlop_dsp_mul", ggmlop_dsp_mul}, + {true, GGML_OP_DIV, 2, "ggmlop_dsp_div", ggmlop_dsp_div}, + {false, GGML_OP_SQR}, + {false, GGML_OP_SQRT}, + {false, GGML_OP_LOG}, + {false, GGML_OP_SIN}, + {false, GGML_OP_COS}, + {false, GGML_OP_SUM}, + {false, GGML_OP_SUM_ROWS}, + {false, GGML_OP_MEAN}, + {false, GGML_OP_ARGMAX}, + {false, GGML_OP_COUNT_EQUAL}, + {false, GGML_OP_REPEAT}, + {false, GGML_OP_REPEAT_BACK}, + {false, GGML_OP_CONCAT}, + {false, GGML_OP_SILU_BACK}, + {false, GGML_OP_NORM}, + {false, GGML_OP_RMS_NORM}, + {false, GGML_OP_RMS_NORM_BACK}, + {false, GGML_OP_GROUP_NORM}, + {false, GGML_OP_L2_NORM}, + {true, GGML_OP_MUL_MAT, 2, "ggmlop_dsp_mulmat", ggmlop_dsp_mulmat}, + {false, GGML_OP_MUL_MAT_ID}, + {false, GGML_OP_OUT_PROD}, + {false, GGML_OP_SCALE}, + {false, GGML_OP_SET}, + {false, GGML_OP_CPY}, + {false, GGML_OP_CONT}, + 
{false, GGML_OP_RESHAPE}, + {false, GGML_OP_VIEW}, + {false, GGML_OP_PERMUTE}, + {false, GGML_OP_TRANSPOSE}, + {false, GGML_OP_GET_ROWS}, + {false, GGML_OP_GET_ROWS_BACK}, + {false, GGML_OP_DIAG}, + {false, GGML_OP_DIAG_MASK_INF}, + {false, GGML_OP_DIAG_MASK_ZERO}, + {false, GGML_OP_SOFT_MAX}, + {false, GGML_OP_SOFT_MAX_BACK}, + {false, GGML_OP_ROPE}, + {false, GGML_OP_ROPE_BACK}, + {false, GGML_OP_CLAMP}, + {false, GGML_OP_CONV_TRANSPOSE_1D}, + {false, GGML_OP_IM2COL}, + {false, GGML_OP_IM2COL_BACK}, + {false, GGML_OP_CONV_TRANSPOSE_2D}, + {false, GGML_OP_POOL_1D}, + {false, GGML_OP_POOL_2D}, + {false, GGML_OP_POOL_2D_BACK}, + {false, GGML_OP_UPSCALE}, + {false, GGML_OP_PAD}, + {false, GGML_OP_PAD_REFLECT_1D}, + {false, GGML_OP_ARANGE}, + {false, GGML_OP_TIMESTEP_EMBEDDING}, + {false, GGML_OP_ARGSORT}, + {false, GGML_OP_LEAKY_RELU}, + {false, GGML_OP_FLASH_ATTN_EXT}, + {false, GGML_OP_FLASH_ATTN_BACK}, + {false, GGML_OP_SSM_CONV}, + {false, GGML_OP_SSM_SCAN}, + {false, GGML_OP_WIN_PART}, + {false, GGML_OP_WIN_UNPART}, + {false, GGML_OP_GET_REL_POS}, + {false, GGML_OP_ADD_REL_POS}, + {false, GGML_OP_RWKV_WKV6}, + {false, GGML_OP_GATED_LINEAR_ATTN}, + {false, GGML_OP_RWKV_WKV7}, + {false, GGML_OP_UNARY}, + {false, GGML_OP_MAP_UNARY}, + {false, GGML_OP_MAP_BINARY}, + {false, GGML_OP_MAP_CUSTOM1_F32}, + {false, GGML_OP_MAP_CUSTOM2_F32}, + {false, GGML_OP_MAP_CUSTOM3_F32}, + {false, GGML_OP_MAP_CUSTOM1}, + {false, GGML_OP_MAP_CUSTOM2}, + {false, GGML_OP_MAP_CUSTOM3}, + {false, GGML_OP_CROSS_ENTROPY_LOSS}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK}, + {false, GGML_OP_OPT_STEP_ADAMW}, + {false, static_cast(GGML_UNARY_OP_ABS)}, + {false, static_cast(GGML_UNARY_OP_SGN)}, + {false, static_cast(GGML_UNARY_OP_NEG)}, + {false, static_cast(GGML_UNARY_OP_STEP)}, + {false, static_cast(GGML_UNARY_OP_TANH)}, + {false, static_cast(GGML_UNARY_OP_ELU)}, + {false, static_cast(GGML_UNARY_OP_RELU)}, + {false, static_cast(GGML_UNARY_OP_SIGMOID)}, + {false, static_cast(GGML_UNARY_OP_GELU)}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK)}, + {false, static_cast(GGML_UNARY_OP_SILU)}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH)}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID)}, + {false, static_cast(GGML_UNARY_OP_EXP)} +}; + +static_assert(ggmlhexagon_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(std::size(ggmlhexagon_k_op_caps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), + "pls check ggmlhexagon_k_op_caps and ensure is corresponding to latest ggml.h"); // ================================================================================================= // section-2: ggml-qnn internal troubleshooting function/class @@ -1787,57 +1908,6 @@ static int ggmlhexagon_request_status_notifications(int domain_id, void * contex return hexagon_error; } -//TODO:not work on cDSP currently, this function will affect the performance of cDSP -static AEEResult ggmlhexagon_set_clocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled) { -#if 0 - GGMLQNN_LOG_DEBUG("----------- entering power set clocks"); - - HAP_power_request_t request; - memset(&request, 0, sizeof(HAP_power_request_t)); - request.type = HAP_power_set_apptype; - request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; - - void * benchmark_ctx = 
(void*)(handle); - int retval = HAP_power_set(benchmark_ctx, &request); - if (retval) { - GGMLQNN_LOG_WARN("failed first power vote"); - return AEE_EFAILED; - } - - //configure clocks & DCVS mode - memset(&request, 0, sizeof(HAP_power_request_t)); - request.type = HAP_power_set_DCVS_v2; - request.dcvs_v2.dcvs_enable = TRUE; - request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; - if (dcvs_enabled) { - request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; - request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; - } else { - request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner; - request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner; - } - request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; - request.dcvs_v2.set_dcvs_params = TRUE; - request.dcvs_v2.set_latency = TRUE; - request.dcvs_v2.latency = latency; - retval = HAP_power_set(benchmark_ctx, &request); - if (retval) { - GGMLQNN_LOG_WARN("failed to vote for performance mode"); - return AEE_EFAILED; - } - - memset(&request, 0, sizeof(HAP_power_request_t)); - request.type = HAP_power_set_HVX; - request.hvx.power_up = TRUE; - retval = HAP_power_set(benchmark_ctx, &request); - if (retval) { - GGMLQNN_LOG_WARN("failed to vote for HVX power"); - return AEE_EFAILED; - } -#endif - return AEE_SUCCESS; -} - static void ggmlhexagon_probe_dspinfo(ggml_backend_qnn_context * ctx, size_t * rpcmem_capacity) { size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; @@ -2024,9 +2094,9 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { if (AEE_SUCCESS == hexagon_error) { GGMLQNN_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); GGMLQNN_LOG_INFO("only support GGML_OP_ADD and GGML_OP_MUL_MAT on cDSP currently\n"); + ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1); size_t rpcmem_size = 0; ggmlhexagon_probe_dspinfo(ctx, &rpcmem_size); - ggmlhexagon_set_clocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1); ggmlhexagon_set_rpc_latency(domain_id, RPC_POLL_QOS, 1000); } else { GGMLQNN_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, @@ -2077,30 +2147,30 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens struct dsptensor dsptensor_0; struct dsptensor dsptensor_1; struct dsptensor dsptensor_2; + std::string op_name; + ggmlqnn_get_opkey_from_op(op, op_name); + + qnn_perf op_perf(op_name); + op_perf.start(); int hexagon_error = AEE_SUCCESS; ggmlhexagon_op_func_t op_func = nullptr; - ggml_tensor * src0 = op->src[0]; - ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; + size_t input_tensor_count = 2; - switch (op->op) { - case GGML_OP_ADD: - op_func = ggmlop_dsp_add; - break; - case GGML_OP_MUL_MAT: { - op_func = ggmlop_dsp_mulmat; - break; - } - default: - return; - } + ggml_tensor * src0 = op->src[0]; + ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; - dsptensor_0.data = src0->data; - dsptensor_0.data_len = ggml_nbytes(src0); + input_tensor_count = ggmlhexagon_k_op_caps[ggmlqnn_get_op_index(op)].input_param_count; + op_func = ggmlhexagon_k_op_caps[ggmlqnn_get_op_index(op)].dsp_op_func; + if (nullptr == op_func) { + GGMLQNN_LOG_DEBUG("op GGML_OP_%s and dsp func %s not supported on cCSP", ggml_op_name(op->op), ggmlhexagon_k_op_caps[ggmlqnn_get_op_index(op)].hexagon_op_name); + return; + } - dsptensor_1.data = src1->data; - dsptensor_2.data = dst->data; + dsptensor_0.data 
= src0->data; + dsptensor_0.data_len = ggml_nbytes(src0); + dsptensor_0.type = src0->type; dsptensor_0.ne[0] = src0->ne[0]; dsptensor_0.ne[1] = src0->ne[1]; @@ -2112,15 +2182,25 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens dsptensor_0.nb[2] = src0->nb[2]; dsptensor_0.nb[3] = src0->nb[3]; - dsptensor_1.ne[0] = src1->ne[0]; - dsptensor_1.ne[1] = src1->ne[1]; - dsptensor_1.ne[2] = src1->ne[2]; - dsptensor_1.ne[3] = src1->ne[3]; + if (2 == input_tensor_count) { + dsptensor_1.data = src1->data; + dsptensor_1.type = src1->type; + dsptensor_1.data_len = ggml_nbytes(src1); + + dsptensor_1.ne[0] = src1->ne[0]; + dsptensor_1.ne[1] = src1->ne[1]; + dsptensor_1.ne[2] = src1->ne[2]; + dsptensor_1.ne[3] = src1->ne[3]; + + dsptensor_1.nb[0] = src1->nb[0]; + dsptensor_1.nb[1] = src1->nb[1]; + dsptensor_1.nb[2] = src1->nb[2]; + dsptensor_1.nb[3] = src1->nb[3]; + } - dsptensor_1.nb[0] = src1->nb[0]; - dsptensor_1.nb[1] = src1->nb[1]; - dsptensor_1.nb[2] = src1->nb[2]; - dsptensor_1.nb[3] = src1->nb[3]; + dsptensor_2.data = dst->data; + dsptensor_2.data_len = ggml_nbytes(dst); + dsptensor_2.type = dst->type; dsptensor_2.ne[0] = dst->ne[0]; dsptensor_2.ne[1] = dst->ne[1]; @@ -2132,18 +2212,16 @@ static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tens dsptensor_2.nb[2] = dst->nb[2]; dsptensor_2.nb[3] = dst->nb[3]; - dsptensor_0.data_len = ggml_nbytes(src0); - dsptensor_1.data_len = ggml_nbytes(src1); - dsptensor_2.data_len = ggml_nbytes(dst); - - dsptensor_0.type = src0->type; - dsptensor_1.type = src1->type; - dsptensor_2.type = dst->type; - + //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_0); + //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_1); + //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_2); hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); if (AEE_SUCCESS != hexagon_error) { GGMLQNN_LOG_WARN("ggmlop %s computation fail on cdsp", ggml_op_name(op->op)); } + + op_perf.info(); + return; } // ================================================================================================= @@ -2386,7 +2464,7 @@ static size_t ggmlqnn_get_op_input_param_count(const ggml_tensor * op) { return ggmlqnn_k_op_caps[op_index].input_param_count; } -static void ggmlqnn_get_graphkey_from_op(const ggml_tensor * op, std::string & output) { +static void ggmlqnn_get_opkey_from_op(const ggml_tensor * op, std::string & output) { GGML_ASSERT(op->op != GGML_OP_NONE); output += ggml_op_desc(op); output += ggmlqnn_get_ggml_type_name(op->type); @@ -2434,7 +2512,7 @@ static void ggmlqnn_get_graphkey_from_cgraph(const ggml_cgraph * cgraph, std::st } if (is_start) { - ggmlqnn_get_graphkey_from_op(cgraph->nodes[0], output); + ggmlqnn_get_opkey_from_op(cgraph->nodes[0], output); is_start = false; } else { output += '#'; @@ -4269,36 +4347,38 @@ static bool ggmlqnn_same_types(const ggml_backend_qnn_context * ctx, const ggml_ } static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { + ggmlqnn_dump_op_info(op_tensor); + if (!ggmlhexagon_k_op_caps[ggmlqnn_get_op_index(op_tensor)].supported) { + return false; + } + struct ggml_tensor * src0 = op_tensor->src[0]; struct ggml_tensor * src1 = op_tensor->src[1]; const int64_t ne00 = op_tensor->src[0]->ne[0]; - uint32_t src0_rank = 0; + uint32_t src0_rank = ggml_n_dims(src0); uint32_t src1_rank = 0; - bool support = false; - - if (nullptr != src0) { - src0_rank = ggml_n_dims(src0); - } if (nullptr != src1) { src1_rank = ggml_n_dims(src1); } + //available in the early 
stage, should be removed in the product stage + bool support = false; if (g_qnn_params.enable_mulmat_cdsp) support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); else support = (op_tensor->op == GGML_OP_ADD); - if (!support) + if (!support) { return false; + } - ggmlqnn_dump_op_info(op_tensor); switch (op_tensor->op) { case GGML_OP_ADD: + case GGML_OP_SUB: { if (!ggml_are_same_shape(src0, src1)) { return false; } - - return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + break; } case GGML_OP_MUL_MAT: { @@ -4315,8 +4395,9 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, cons return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } default: - return ggmlqnn_same_types(ctx, op_tensor); + break; } + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { @@ -4409,6 +4490,11 @@ static bool ggmlqnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * ggmlqnn_op_func_t func = nullptr; ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; + if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { + ggmlhexagon_compute(ctx, dst); + return true; + } + switch (dst->op) { case GGML_OP_REPEAT: ggmlqnn_compute_repeat(ctx, dst); @@ -4544,12 +4630,7 @@ struct ggml_backend_qnn_buffer_context { free(sub_buffer); } - for (auto * qnn_tensor : qnn_tensors) { - free_qnn_tensor(qnn_tensor); - } - sub_buffers.clear(); - qnn_tensors.clear(); } void * buffer = nullptr; @@ -4557,7 +4638,6 @@ struct ggml_backend_qnn_buffer_context { size_t buffer_size = 0; std::vector sub_buffers; - std::vector qnn_tensors; }; static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -5231,17 +5311,11 @@ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_ten const char * ggml_op_name = ggml_op_name_string.c_str(); std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); + ggmlqnn_get_opkey_from_op(op, graph_name); - qnn_perf op_perf = qnn_perf(graph_name); + qnn_perf op_perf(graph_name); op_perf.start(); - if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { - ggmlhexagon_compute(ctx, op); - op_perf.info(); - return; - } - bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { //retrieve computational resource from cached QNN graph @@ -5381,7 +5455,7 @@ static void ggmlqnn_compute_mul_mat_4d(ggml_backend_qnn_context * ctx, ggml_tens op_perf.start(); std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); + ggmlqnn_get_opkey_from_op(op, graph_name); GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); @@ -5623,17 +5697,11 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); std::string graph_name; - ggmlqnn_get_graphkey_from_op(op, graph_name); + ggmlqnn_get_opkey_from_op(op, graph_name); - qnn_perf op_perf = qnn_perf(graph_name); + qnn_perf op_perf(graph_name); op_perf.start(); - if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { - ggmlhexagon_compute(ctx, op); - op_perf.info(); - return; - } - GGML_ASSERT(src0_rank == 
src1_rank); GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy if (4 == src0_rank) { diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c index 82de512150bf8..45f6a9b86e426 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c @@ -1,3 +1,6 @@ +//qidl copyright +//qidl nested=false +#include "ggmlop_ap_skel.h" #include #ifndef _WIN32 #include "HAP_farf.h" @@ -8,7 +11,6 @@ #include #include -#include "ggmlop_ap_skel.h" typedef struct _heap _heap; struct _heap { @@ -274,14 +276,14 @@ static const Type types[4]; static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[2])}; static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x30,0x4,0x2c,0x4,0x4,0x4}}; static const Type types[4] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[3]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; -static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; -static const Parameter* const parameterArrays[6] = {(&(parameters[3])),(&(parameters[3])),(&(parameters[4])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; -static const Method methods[3] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0x64,0x2c,3,3,(&(parameterArrays[0])),0x4,0x4}}; -static const Method* const methodArrays[4] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[2])}; -static const char strings[68] = "mulmat\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0add\0uri\0op\0nb\0ne\0h\0"; -static const uint16_t methodStrings[49] = {0,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,48,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,39,52,65,13,65}; -static const uint16_t methodStringsArrays[4] = {44,47,22,0}; -__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {4,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const Parameter* const parameterArrays[9] = 
{(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0x64,0x2c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; +static const char strings[146] = "dsp_setclocks\0dcvs_enable\0power_level\0dsp_mulmat\0dsp_div\0dsp_sub\0dsp_mul\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[119] = {49,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,57,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,65,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,38,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,73,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,0,26,81,14,121,130,143,95,143}; +static const uint16_t methodStringsArrays[8] = {114,117,110,88,66,44,22,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; #endif //_GGMLOP_SLIM_H @@ -294,6 +296,25 @@ __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_hand __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { return __QAIC_REMOTE(remote_handle64_close)(h); } +static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1]) { + remote_arg _pra[1] = {0}; + uint32_t _primIn[3]= {0}; + int _nErr = 0; + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 4, _in1, 0, 4); + _COPY(_primIn, 8, _in2, 0, 4); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra)); + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _mid, __func__); + } + return _nErr; +} +__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 dcvs_enable) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 2; + return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable); +} static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { int _nErr = 0; remote_arg* _praROutPostStart = _praROutPost; @@ -354,7 +375,7 @@ static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], in _numInH[0] += 0; _numROutH[0] 
+= 0; } -static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uintptr_t _in0[SLIM_IFPTR32(13, 8)], uintptr_t _in1[SLIM_IFPTR32(13, 8)], uintptr_t _rout2[SLIM_IFPTR32(13, 8)]) { +static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintptr_t _in0[SLIM_IFPTR32(13, 8)], uintptr_t _in1[SLIM_IFPTR32(13, 8)], uintptr_t _rout2[SLIM_IFPTR32(13, 8)]) { remote_arg* _pra = 0; int _numIn[1] = {0}; int _numROut[1] = {0}; @@ -420,10 +441,22 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uintptr return _nErr; } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_add)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { - uint32_t _mid = 2; - return _stub_method(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); + uint32_t _mid = 3; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mulmat)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { - uint32_t _mid = 3; - return _stub_method(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); + uint32_t _mid = 4; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mul)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 5; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_sub)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 6; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); +} +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_div)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { + uint32_t _mid = 7; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h index 8e05d06f1c2ba..660bab4a15e70 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h +++ b/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h @@ -1,11 +1,13 @@ -#ifndef _GGMLOP_AP_SKEL_H -#define _GGMLOP_AP_SKEL_H - +#ifndef _GGMLOP_H +#define _GGMLOP_H +//qidl copyright +//qidl nested=false #include #include #include #include + #ifndef __QAIC_HEADER #define __QAIC_HEADER(ff) ff #endif //__QAIC_HEADER @@ -276,12 +278,16 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_ * @retval, 0 on success, should always succeed */ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 dcvs_enable) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mul)(remote_handle64 _h, const dsptensor* src0, const 
dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_sub)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_div)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; #ifndef ggmlop_URI #define ggmlop_URI "file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" #endif /*ggmlop_URI*/ #ifdef __cplusplus } #endif -#endif //_GGMLOP_AP_SKEL_H +#endif //_GGMLOP_H diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c index 85d0ad8c8e29e..f09a991833d56 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c @@ -26,10 +26,15 @@ #include #include +#include "HAP_perf.h" #include "HAP_farf.h" +#include "HAP_power.h" +#include "HAP_vtcm_mgr.h" #include "HAP_compute_res.h" -#include "hexagon_types.h" + #include "AEEStdErr.h" +#include "hexagon_types.h" +#include "hexagon_protos.h" #include "ggmlop_ap_skel.h" @@ -52,7 +57,6 @@ #define MIN(a, b) ((a) < (b) ? (a) : (b)) #define MAX(a, b) ((a) > (b) ? (a) : (b)) -#define SWAP(x, y, T) do { T SWAP = x; (x) = y; (y) = SWAP; } while (0) #if UINTPTR_MAX == 0xFFFFFFFF #define GGML_MEM_ALIGN 4 @@ -65,19 +69,6 @@ #define static_assert(a, b) do { } while (0) typedef double ggml_float; -typedef uint16_t ggml_fp16_t; -typedef struct { uint16_t bits; } ggml_bf16_t; - -static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...); - -enum ggmlhexagon_log_level { - GGMLHEXAGON_LOG_LEVEL_NONE = 0, - GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, - GGMLHEXAGON_LOG_LEVEL_INFO = 2, - GGMLHEXAGON_LOG_LEVEL_WARN = 3, - GGMLHEXAGON_LOG_LEVEL_ERROR = 4, - GGMLHEXAGON_LOG_LEVEL_CONT = 5, -}; #if 0//def NDEBUG #define GGMLQNN_DEBUG 0 @@ -86,11 +77,7 @@ enum ggmlhexagon_log_level { #endif #define GGMLHEXAGON_LOGBUF_LEN 4096 -#define GGMLHEXAGON_TMPBUF_LEN 256 - -#define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGML_QNN_TMPBUF_LEN 256 #if GGMLQNN_DEBUG #define GGMLHEXAGON_LOG_DEBUG(...) 
ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else @@ -98,7 +85,6 @@ enum ggmlhexagon_log_level { #endif #define GGMLQNN_DUMP_TENSOR(tensor) ggmlhexagon_dump_tensor(tensor, #tensor) - #define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ const type prefix##0 = (pointer)->array[0]; \ GGML_UNUSED(prefix##0); @@ -135,6 +121,15 @@ enum ggmlhexagon_log_level { GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) +enum ggmlhexagon_log_level { + GGMLHEXAGON_LOG_LEVEL_NONE = 0, + GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, + GGMLHEXAGON_LOG_LEVEL_INFO = 2, + GGMLHEXAGON_LOG_LEVEL_WARN = 3, + GGMLHEXAGON_LOG_LEVEL_ERROR = 4, + GGMLHEXAGON_LOG_LEVEL_CONT = 5, +}; + enum ggml_type { GGML_TYPE_F32 = 0, GGML_TYPE_F16 = 1, @@ -178,17 +173,16 @@ enum ggml_type { GGML_TYPE_COUNT = 39, }; +static size_t ggml_nbytes(const struct ggml_tensor * tensor); +static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...); +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); -static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); -static void ggml_vec_dot_f16(int n, float * GGML_RESTRICT s, size_t bs, ggml_fp16_t * GGML_RESTRICT x, size_t bx, ggml_fp16_t * GGML_RESTRICT y, size_t by, int nrc); -static void ggml_vec_dot_bf16(int n, float * GGML_RESTRICT s, size_t bs, ggml_bf16_t * GGML_RESTRICT x, size_t bx, ggml_bf16_t * GGML_RESTRICT y, size_t by, int nrc); - -typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, +typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, const void * GGML_RESTRICT y, size_t by, int nrc); -typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); -typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); struct ggml_type_traits { const char * type_name; @@ -229,7 +223,7 @@ static const struct ggml_type_traits type_traits[1] = { // section-2: ggml-hexagon kernel's internal troubleshooting function // ================================================================================================= static void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) 
{ - return; + //return; static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; va_list args; va_start(args, format); @@ -238,26 +232,48 @@ static void ggmlhexagon_log_internal(int level, const char *file, const char *fu int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args); if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) { - FARF(ALWAYS, "%s\n", s_ggmlhexagon_log_internal_buf); } va_end(args); } -static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor) { +static void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { + //return; + float value = 0; + char tmpbuf[GGMLHEXAGON_LOGBUF_LEN]; + size_t buflen = 0; + if (tensor->type == GGML_TYPE_F32) { + memset(tmpbuf, 0, sizeof(tmpbuf)); + for (int h = 0; h < tensor->ne[3]; h++) { + for (int i = 0; i < tensor->ne[2]; i++) { + for (int j = 0; j < tensor->ne[1]; j++) { + for (int k = 0; k < tensor->ne[0]; k++) { + value = ((float *) tensor->data)[((h * tensor->ne[2] + i) * tensor->ne[1] + + j) * tensor->ne[0] + k]; + buflen += snprintf(tmpbuf + buflen, GGMLHEXAGON_LOGBUF_LEN - buflen, "%-4.2f\t", value); + } + buflen += snprintf(tmpbuf + buflen, GGMLHEXAGON_LOGBUF_LEN - buflen, "\n"); + } + } + } + GGMLHEXAGON_LOG_DEBUG("\n%s\n", tmpbuf); + } + + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data) { GGMLHEXAGON_LOG_DEBUG("ne = %5d x %5d x %5d x %5d , nb = (%5zi, %5zi, %5zi, %5zi)\n", tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); -} -static void ggml_abort(const char * file, int line, const char * fmt, ...) { - GGMLHEXAGON_LOG_DEBUG("enter ggml_abort"); - //abort(); - return; + if ((1 == dump_tensor_data) && (ggml_nbytes(tensor) < 320)) { + ggmlhexagon_dump_tensor_elements(tensor); + } } // ================================================================================================= -// section-3: ggml-hexagon kernel's helper function(tiny ggml-dsp, ported from original ggml) +// section-3: tiny ggml-dsp(ggml on Hexagon cDSP, ported from original ggml) // ================================================================================================= static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { return &type_traits_cpu[type]; @@ -277,6 +293,18 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl *s = sumf; } +inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { + for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; +} + +inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { + for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; +} + +inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { + for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; +} + static const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { return &type_traits[type]; } @@ -401,6 +429,15 @@ inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, co for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } +static void ggml_abort(const char * file, int line, const char * fmt, ...)
{ + GGMLHEXAGON_LOG_DEBUG("enter ggml_abort"); + abort(); + return; +} + +// ================================================================================================= +// section-4: ggml-hexagon kernel helper function +// ================================================================================================= int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { void *tptr = NULL; FARF(HIGH, "uri %s", uri); @@ -416,16 +453,67 @@ int ggmlop_dsp_close(remote_handle64 handle) { return 0; } +AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + HAP_power_request_t request; + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_apptype; + request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; + + void * ggmop_ctx = (void*)(handle); + int retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed first power vote"); + return AEE_EFAILED; + } + + //configure clocks & DCVS mode + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_DCVS_v2; + request.dcvs_v2.dcvs_enable = TRUE; + request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; + if (dcvs_enabled) { + request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; + request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; + } else { + request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner; + request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner; + } + request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + request.dcvs_v2.set_dcvs_params = TRUE; + request.dcvs_v2.set_latency = TRUE; + request.dcvs_v2.latency = latency; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode"); + return AEE_EFAILED; + } + + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_HVX; + request.hvx.power_up = TRUE; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power"); + return AEE_EFAILED; + } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return AEE_SUCCESS; +} + // ================================================================================================= -// section-4: ggml-hexagon kernel function +// section-5: ggml-hexagon kernel function: offload ggmlop to cDSP through Hexagon C API and SIMD instructions // ================================================================================================= static void ggml_compute_forward_add_f32( - struct ggml_tensor * src0, - struct ggml_tensor * src1, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); memcpy(dst->ne, src1->ne, 16); memcpy(dst->nb, src1->nb, 16); + ggmlhexagon_dump_tensor(src0, 1); + ggmlhexagon_dump_tensor(src1, 1); + ggmlhexagon_dump_tensor(dst, 1); GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); @@ -458,9 +546,17 @@ static void ggml_compute_forward_add_f32( float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); for (int64_t r = 0; r < nr0; ++r) { #ifdef GGML_USE_ACCELERATE - vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); + //vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); + HVX_Vector *va = (HVX_Vector *) 
src1_ptr; + HVX_Vector *vb = (HVX_Vector *) src0_ptr + r * ne10; + HVX_Vector *vc = (HVX_Vector *) dst_ptr + r * ne10; + int total_vectors = ne10 / FLOATS_PER_VECTOR; + GGMLHEXAGON_LOG_DEBUG("total_vectors %d", total_vectors); + for (int i = 0; i < total_vectors; ++i) { + *vc++ = Q6_Vqf32_vadd_Vqf32Vqf32(*va++, *vb++); + } #else - ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); + ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); #endif } } @@ -492,7 +588,7 @@ static void ggml_compute_forward_add_f32( int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGMLHEXAGON_LOG_DEBUG("enter ggmlop_dsp_add\n"); + GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); switch (src0->type) { case GGML_TYPE_F32: { @@ -508,13 +604,13 @@ int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tenso GGML_ABORT("fatal error"); } } - GGMLHEXAGON_LOG_DEBUG("leave ggmlop_dsp_add\n"); + GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); return 0; } static void ggml_compute_forward_mul_mat_one_chunk( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, + const ggml_tensor * src0, + const ggml_tensor * src1, struct ggml_tensor * dst, const enum ggml_type type, const int32_t num_rows_per_vec_dot, @@ -522,9 +618,9 @@ static void ggml_compute_forward_mul_mat_one_chunk( const int32_t ir0_end, const int32_t ir1_start, const int32_t ir1_end) { - ggmlhexagon_dump_tensor(src0); - ggmlhexagon_dump_tensor(src1); - ggmlhexagon_dump_tensor(dst); + ggmlhexagon_dump_tensor(src0, 0); + ggmlhexagon_dump_tensor(src1, 0); + ggmlhexagon_dump_tensor(dst, 0); dst->ne[0] = src0->ne[1]; dst->ne[1] = src1->ne[1]; @@ -535,7 +631,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); dst->nb[2] = dst->nb[1] * dst->ne[1]; dst->nb[3] = dst->nb[2] * dst->ne[2]; - ggmlhexagon_dump_tensor(dst); + ggmlhexagon_dump_tensor(dst, 0); GGML_TENSOR_BINARY_OP_LOCALS @@ -613,9 +709,10 @@ static void ggml_compute_forward_mul_mat_one_chunk( } int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggmlhexagon_dump_tensor(src0); - ggmlhexagon_dump_tensor(src1); - ggmlhexagon_dump_tensor(dst); + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + ggmlhexagon_dump_tensor(src0, 0); + ggmlhexagon_dump_tensor(src1, 0); + ggmlhexagon_dump_tensor(dst, 0); dst->ne[0] = src0->ne[1]; dst->ne[1] = src1->ne[1]; @@ -626,7 +723,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); dst->nb[2] = dst->nb[1] * dst->ne[1]; dst->nb[3] = dst->nb[2] * dst->ne[2]; - ggmlhexagon_dump_tensor(dst); + ggmlhexagon_dump_tensor(dst, 0); GGML_TENSOR_BINARY_OP_LOCALS @@ -735,6 +832,276 @@ static void ggml_compute_forward_mul_mat_one_chunk( } current_chunk++; } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +static void ggml_compute_forward_sub_f32( + const ggml_tensor * src0, + const ggml_tensor * src1, + struct ggml_tensor * dst) { + + memcpy(dst->ne, src1->ne, 16); + memcpy(dst->nb, src1->nb, 16); + + assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int ith = 0; + const int nth = 1; + + const int nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread 
+ const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne0; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] - *src1_ptr; + } + } + } +} +int ggmlop_dsp_sub(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_sub_f32(src0, src1, dst); + } break; + default: + { + GGML_ABORT("fatal error"); + } + } return 0; } + +static void ggml_compute_forward_mul_f32( + const ggml_tensor * src0, + const ggml_tensor * src1, + struct ggml_tensor * dst) { + + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + memcpy(dst->ne, src1->ne, 16); + memcpy(dst->nb, src1->nb, 16); + + + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int ith = 0; + const int nth = 1; + + const int64_t nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0 ; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_mul_f32); + + vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous 
+ for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); + } + } + } +} + +int ggmlop_dsp_mul(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); + switch (src0->type) { + case GGML_TYPE_F32: + { + if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_mul_f32(src0, src1, dst); + } else { + GGML_ABORT("fatal error"); + } + break; + } + default: + { + GGML_ABORT("fatal error"); + } + } + GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); + return 0; +} +static void ggml_compute_forward_div_f32( + const ggml_tensor * src0, + const ggml_tensor * src1, + struct ggml_tensor * dst) { + + memcpy(dst->ne, src1->ne, 16); + memcpy(dst->nb, src1->nb, 16); + + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int ith = 0; + const int nth = 1; + + const int64_t nr = ggml_nrows(src0); + + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + if (nb10 == sizeof(float)) { + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + + for (int64_t r = 0; r < nr0; ++r) { +#ifdef GGML_USE_ACCELERATE + UNUSED(ggml_vec_div_f32); + + vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); +#else + ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); +#endif + } + } + } else { + // src1 is not contiguous + for (int64_t ir = ith; ir < nr; ir += nth) { + // src0 and dst are same shape => same indices + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int64_t i0 = 0; i0 < ne00; ++i0) { + const int64_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); + } + } + } +} + +int 
ggmlop_dsp_div(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_div_f32(src0, src1, dst); + } break; + + default: + { + GGML_ABORT("fatal error"); + } + } +} diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c index 58bf1a846742f..f2660e0a518d1 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c @@ -1,3 +1,7 @@ +//qidl copyright +//qidl nested=false +#include "ggmlop_ap_skel.h" + #include #ifndef _WIN32 #include "HAP_farf.h" @@ -7,7 +11,6 @@ #include #include -#include "ggmlop_ap_skel.h" typedef struct _heap _heap; struct _heap { @@ -273,14 +276,14 @@ static const Type types[4]; static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[2])}; static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x30,0x4,0x2c,0x4,0x4,0x4}}; static const Type types[4] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[3]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; -static const Parameter parameters[5] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; -static const Parameter* const parameterArrays[6] = {(&(parameters[3])),(&(parameters[3])),(&(parameters[4])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; -static const Method methods[3] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[3])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[5])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0x64,0x2c,3,3,(&(parameterArrays[0])),0x4,0x4}}; -static const Method* const methodArrays[4] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[2])}; -static const char strings[68] = "mulmat\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0add\0uri\0op\0nb\0ne\0h\0"; -static const uint16_t methodStrings[49] = {0,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,48,34,29,62,59,56,7,24,19,29,62,59,56,7,24,44,29,62,59,56,7,24,39,52,65,13,65}; -static const uint16_t methodStringsArrays[4] = {44,47,22,0}; -__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {4,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const 
Parameter* const parameterArrays[9] = {(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0x64,0x2c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; +static const char strings[146] = "dsp_setclocks\0dcvs_enable\0power_level\0dsp_mulmat\0dsp_div\0dsp_sub\0dsp_mul\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[119] = {49,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,57,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,65,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,38,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,73,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,0,26,81,14,121,130,143,95,143}; +static const uint16_t methodStringsArrays[8] = {114,117,110,88,66,44,22,0}; +__QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; #endif //_GGMLOP_SLIM_H extern int adsp_mmap_fd_getinfo(int, uint32_t *); #ifdef __cplusplus @@ -392,7 +395,29 @@ static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, _allocator_deinit(_al); return _nErr; } -static __inline int _skel_method_1(int (*_pfn)(remote_handle64), uint32_t _sc, remote_arg* _pra) { +static __inline int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, int32), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { + remote_arg* _praEnd = 0; + uint32_t _in0[1] = {0}; + uint32_t _in1[1] = {0}; + uint32_t _in2[1] = {0}; + uint32_t* _primIn= 0; + int _nErr = 0; + _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INBUFS(_sc)==1); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTBUFS(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_INHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); + _QAIC_ASSERT(_nErr, (_pra + ((1 + 0) + (((0 + 0) + 0) + 0))) <= _praEnd); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 12); + _primIn = _pra[0].buf.pv; + _COPY(_in0, 0, _primIn, 0, 4); + _COPY(_in1, 0, _primIn, 4, 4); + _COPY(_in2, 0, _primIn, 8, 4); + _TRY(_nErr, _pfn(_h, (int32)*_in0, (int32)*_in1, (int32)*_in2)); + _QAIC_CATCH(_nErr) {} + return _nErr; +} +static __inline int _skel_method_2(int (*_pfn)(remote_handle64), uint32_t _sc, remote_arg* _pra) { remote_arg* _praEnd = 0; remote_handle64 _in0[1] = {0}; remote_arg* _praRHandleIn = _pra + REMOTE_SCALARS_INBUFS(_sc) + REMOTE_SCALARS_OUTBUFS(_sc); @@ -536,7 +561,7 @@ static __inline int _stub_skel_version_check(char*_in0, int* resVal) { _QAIC_CATCH(_nErr) {} return 0; } -static __inline int _skel_method_2(int (*_pfn)(const char*, remote_handle64*), uint32_t _sc, remote_arg* _pra) { +static __inline int 
_skel_method_3(int (*_pfn)(const char*, remote_handle64*), uint32_t _sc, remote_arg* _pra) { remote_arg* _praEnd = 0; char* _in0[1] = {0}; uint32_t _in0Len[1] = {0}; @@ -572,13 +597,21 @@ static __inline int _skel_method_2(int (*_pfn)(const char*, remote_handle64*), u __QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmlop_skel_handle_invoke)(remote_handle64 _h, uint32_t _sc, remote_arg* _pra) __QAIC_SKEL_ATTRIBUTE { switch(REMOTE_SCALARS_METHOD(_sc)){ case 0: - return _skel_method_2(__QAIC_IMPL(ggmlop_dsp_open), _sc, _pra); + return _skel_method_3(__QAIC_IMPL(ggmlop_dsp_open), _sc, _pra); case 1: - return _skel_method_1(__QAIC_IMPL(ggmlop_dsp_close), _sc, _pra); + return _skel_method_2(__QAIC_IMPL(ggmlop_dsp_close), _sc, _pra); case 2: - return _skel_method(__QAIC_IMPL(ggmlop_dsp_add), _h, _sc, _pra); + return _skel_method_1(__QAIC_IMPL(ggmlop_dsp_setclocks), _h, _sc, _pra); case 3: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_add), _h, _sc, _pra); + case 4: return _skel_method(__QAIC_IMPL(ggmlop_dsp_mulmat), _h, _sc, _pra); + case 5: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_mul), _h, _sc, _pra); + case 6: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_sub), _h, _sc, _pra); + case 7: + return _skel_method(__QAIC_IMPL(ggmlop_dsp_div), _h, _sc, _pra); } return AEE_EUNSUPPORTED; } diff --git a/scripts/ggml-qnn.cfg b/scripts/ggml-qnn.cfg index 17bd8f6a4b1ca..7689e1dced161 100644 --- a/scripts/ggml-qnn.cfg +++ b/scripts/ggml-qnn.cfg @@ -32,6 +32,6 @@ precision_mode = "fp16" #hwaccel approach through cDSP [cdsp] #enable/disable offload mulmat to cDSP -enable_mulmat_cdsp = 0 +enable_mulmat_cdsp = 1 #enable/disable offload fp32 & all quantized type mulmat to cDSP enable_q_mulmat = 0 From 105e1cd82202b6c8355db70e72d6762e4cc841ac Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 28 Mar 2025 09:10:27 +0800 Subject: [PATCH 139/200] ggml-hexagon: code on AP(arm-cpu) side is stable now --- ggml/src/ggml-qnn/ggml-qnn.cpp | 83 +++++++++++++++++----------------- 1 file changed, 41 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-qnn/ggml-qnn.cpp index dcc332042bdc1..b89b283552acf 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-qnn/ggml-qnn.cpp @@ -158,6 +158,7 @@ static void ggmlqnn_compute_timestep_embedding(ggml_backend_qnn_context * ctx, static void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); static size_t ggmlqnn_get_op_index(const ggml_tensor * tensor); +static const char * ggmlqnn_get_hwaccel_approach_name(int hwaccle_approach); static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); static void ggmlqnn_get_opkey_from_op(const ggml_tensor * op, std::string & output); @@ -875,6 +876,33 @@ static void ggmlqnn_dump_tensor(const ggml_tensor * tensor, const char * name) { GGMLQNN_LOG_DEBUG("\n"); } +static void ggmlqnn_get_timestring(char * p_currenttime) { + time_t n_seconds = 0; + struct tm * p_tm = nullptr; + + if (nullptr == p_currenttime) + return; + + time(&n_seconds); + p_tm = localtime(&n_seconds); + snprintf(p_currenttime, GGML_QNN_TMPBUF_LEN, "%04d-%02d-%02d,%02d:%02d:%02d", + p_tm->tm_year + 1900, p_tm->tm_mon + 1, p_tm->tm_mday, + p_tm->tm_hour, p_tm->tm_min, p_tm->tm_sec); +} + +static void ggmlqnn_print_running_timestamp(ggml_backend_qnn_context * ctx) { + GGMLQNN_LOG_INFO("hwaccel approach is %d(%s)", g_qnn_params.hwaccel_approach, + ggmlqnn_get_hwaccel_approach_name(g_qnn_params.hwaccel_approach)); + char timestamp[GGML_QNN_TMPBUF_LEN]; + 
ggmlqnn_get_timestring(timestamp); + if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { + GGMLQNN_LOG_INFO("only offload GGML_OP_ADD : %s", g_qnn_params.enable_q_mulmat ? "NO" : "YES"); + } else { + GGMLQNN_LOG_INFO("only offload GGML_OP_ADD: NO"); + } + GGMLQNN_LOG_INFO("running timestamp:%s", timestamp); +} + class qnn_perf { public: qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; @@ -1065,20 +1093,6 @@ static char * ggmlqnn_strndup(const char * source, size_t maxlen) { #endif } -static void ggmlqnn_get_timestring(char * p_currenttime) { - time_t n_seconds = 0; - struct tm * p_tm = nullptr; - - if (nullptr == p_currenttime) - return; - - time(&n_seconds); - p_tm = localtime(&n_seconds); - snprintf(p_currenttime, GGML_QNN_TMPBUF_LEN, "%04d-%02d-%02d-%02d-%02d-%02d", - p_tm->tm_year + 1900, p_tm->tm_mon + 1, p_tm->tm_mday, - p_tm->tm_hour, p_tm->tm_min, p_tm->tm_sec); -} - // ================================================================================================= // section-5: QNN helper function // ================================================================================================= @@ -2123,7 +2137,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { static void ggmlhexagon_close_cdsp(ggml_backend_qnn_context * ctx) { int hexagon_error = AEE_SUCCESS; - GGMLQNN_LOG_DEBUG("enter %s", __func__); + GGMLQNN_LOG_INFO("enter %s", __func__); if (-1 != ctx->ggmlop_handle) { hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); if (AEE_SUCCESS != hexagon_error) { @@ -2139,7 +2153,7 @@ static void ggmlhexagon_close_cdsp(ggml_backend_qnn_context * ctx) { ctx->rpc_mempool_len = 0; ctx->domain_id = -1; } - GGMLQNN_LOG_DEBUG("leave %s", __func__); + GGMLQNN_LOG_INFO("leave %s", __func__); } static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tensor * op) { @@ -3693,7 +3707,7 @@ int qnn_instance::qnn_finalize() { int ret_status = 0; Qnn_ErrorHandle_t error = QNN_SUCCESS; - GGMLQNN_LOG_DEBUG("enter %s\n", __func__); + GGMLQNN_LOG_INFO("enter %s\n", __func__); ggmlqnn_reset_idx(); free_rpcmem(); @@ -3760,7 +3774,7 @@ int qnn_instance::qnn_finalize() { unload_backend(); unload_system(); - GGMLQNN_LOG_DEBUG("leave %s\n", __func__); + GGMLQNN_LOG_INFO("leave %s\n", __func__); return ret_status; } @@ -4149,20 +4163,13 @@ static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * p return opcfg; } -static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, - const ggml_tensor * tensor, const char * name, - Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, - uint32_t rank, uint32_t * dims, - void * data, uint32_t data_size, - bool b_transpose = false); static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, const ggml_tensor * tensor, const char * name, Qnn_TensorType_t qnn_tensor_type, Qnn_DataType_t qnn_data_type, uint32_t rank, uint32_t * dims, void * data, uint32_t data_size, - bool b_transpose) { + bool b_transpose = false) { Qnn_ErrorHandle_t error = QNN_SUCCESS; char tensor_name[GGML_MAX_NAME] = {}; @@ -4196,14 +4203,6 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); tensor_dims = transpose_dims; -#if 0 - for (size_t idx = 0; idx < 4; idx++) { - GGMLQNN_LOG_DEBUG("origin dim[%d]=%d\n", idx, reverse_dims[idx]); - } - for (size_t idx = 0; idx 
< 4; idx++) { - GGMLQNN_LOG_DEBUG("trans dim[%d]=%d\n", idx, transpose_dims[idx]); - } -#endif } Qnn_Tensor_t qnn_tensor = { @@ -4763,11 +4762,6 @@ static const char * ggml_backend_qnn_name(ggml_backend_t backend) { static void ggml_backend_qnn_free(ggml_backend_t backend) { GGMLQNN_LOG_DEBUG("enter %s", __func__ ); ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - GGMLQNN_LOG_DEBUG("device idx %d, name:%s", ctx->device, g_qnn_mgr[ctx->device].name); - - if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { - ggmlhexagon_close_cdsp(ctx); - } qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; if (instance != nullptr) { @@ -4791,8 +4785,13 @@ static void ggml_backend_qnn_free(ggml_backend_t backend) { } if (g_qnn_mgr[ctx->device].backend != nullptr) { + if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { + ggmlhexagon_close_cdsp(ctx); + } + delete backend; g_qnn_mgr[ctx->device].backend = nullptr; + ggmlqnn_print_running_timestamp(ctx); } GGMLQNN_LOG_DEBUG("leave %s", __func__ ); } @@ -5155,7 +5154,7 @@ const char * ggml_backend_qnn_get_devname(size_t dev_num) { static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) { int result = 0; - GGMLQNN_LOG_INFO("inference approach=%d(%s)", g_qnn_params.hwaccel_approach, + GGMLQNN_LOG_INFO("hwaccel approach=%d(%s)", g_qnn_params.hwaccel_approach, ggmlqnn_get_hwaccel_approach_name(g_qnn_params.hwaccel_approach)); qnn_instance * instance = nullptr; @@ -5219,7 +5218,7 @@ ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { return g_qnn_mgr[device].backend; } - //don't initialize QNN when inference approach is offload ggml op to Hexagon cDSP directly + //don't initialize QNN when hwaccel approach is offload ggml op to Hexagon cDSP directly if (HWACCEL_CDSP != g_qnn_params.hwaccel_approach) { qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path); if (nullptr == instance) @@ -5287,7 +5286,7 @@ static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const } /* - * provide a general skeleton to offload ggml op to QNN backend or Hexagon cDSP: perform element-wise + * provide a general skeleton to offload ggml op to QNN backend: perform element-wise * operation on 1/2 input tensors and 1 output tensors */ static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * op) { From 03ae20fc1d091e28f11e8ede5bc57679edc21514 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 28 Mar 2025 21:45:30 +0800 Subject: [PATCH 140/200] ggml-hexagon: optimize GGML_OP_ADD on cDSP side --- ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c | 58 +++++++++++++++++++------ 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c index f09a991833d56..410422f2f6d1e 100644 --- a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c @@ -45,6 +45,8 @@ #define GGML_MAX_DIMS 4 +#define ALIGN_128_BYTE 128 + #define GGML_UNUSED(x) (void)(x) #define UNUSED GGML_UNUSED @@ -223,7 +225,7 @@ static const struct ggml_type_traits type_traits[1] = { // section-2: ggml-hexagon kernel's internal troubleshooting function // ================================================================================================= static void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) 
{ - //return; + return; static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; va_list args; va_start(args, format); @@ -504,6 +506,46 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 // ================================================================================================= // section-5: ggml-hexagon kernel function: offload ggmlop to cDSP through Hexagon C API and SIMD instructions // ================================================================================================= +inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) { + HVX_Vector * va; + HVX_Vector * vb; + HVX_Vector * vc; + HVX_Vector qf32; + const int FLOATS_PER_VECTOR = 128 / sizeof(float); + const int block = n / FLOATS_PER_VECTOR; + const int left = n % FLOATS_PER_VECTOR; + const int blocks = block * FLOATS_PER_VECTOR; + + if (0 == block) { + for (size_t i = 0; i < n; ++i) + z[i] = x[i] + y[i]; + + return; + } + + if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) { + GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y); + for (size_t i = 0; i < n; ++i) + z[i] = x[i] + y[i]; + + return; + } + + va = (HVX_Vector *)x; + vb = (HVX_Vector *)y; + vc = (HVX_Vector *)z; + for (size_t i = 0; i < block; ++i) { + qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); + *vc = Q6_Vsf_equals_Vqf32(qf32); + vc++; + } + + if (left > 0) { + for (size_t i = 0; i < left; ++i) + z[i + blocks] = x[i + blocks] + y[i + blocks]; + } +} + static void ggml_compute_forward_add_f32( const struct ggml_tensor * src0, const struct ggml_tensor * src1, @@ -545,19 +587,7 @@ static void ggml_compute_forward_add_f32( float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - //vDSP_vadd(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); - HVX_Vector *va = (HVX_Vector *) src1_ptr; - HVX_Vector *vb = (HVX_Vector *) src0_ptr + r * ne10; - HVX_Vector *vc = (HVX_Vector *) dst_ptr + r * ne10; - int total_vectors = ne10 / FLOATS_PER_VECTOR; - GGMLHEXAGON_LOG_DEBUG("total_vectors %d", total_vectors); - for (int i = 0; i < total_vectors; ++i) { - *vc++ = Q6_Vqf32_vadd_Vqf32Vqf32(*va++, *vb++); - } -#else - ggml_vec_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif + ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); } } } else { From ba05e0494f9dbaebe6b4e9700b04d4111159c770 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 29 Mar 2025 12:09:15 +0800 Subject: [PATCH 141/200] ggml-hexagon: simplify hexagon-kernel build logic in CMakeLists.txt --- ggml/src/ggml-qnn/CMakeLists.txt | 24 +++++++++++++++++++++--- ggml/src/ggml-qnn/kernels/Makefile | 9 +++++---- scripts/build-run-android.sh | 24 ++++++++++-------------- 3 files changed, 36 insertions(+), 21 deletions(-) diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-qnn/CMakeLists.txt index 4fb3a8b6d4b47..ed6fb2fd85608 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-qnn/CMakeLists.txt @@ -1,4 +1,5 @@ -message(STATUS "Using QNN backend") +project(ggml-qnn) +message(STATUS "Using HEXAGON backend") message("CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}") if(NOT DEFINED QNN_SDK_PATH) @@ -9,8 +10,9 @@ if(NOT DEFINED HEXAGON_SDK_PATH) message(FATAL_ERROR "HEXAGON_SDK_PATH not defined") endif() 
-message("QNN_SDK_PATH: ${QNN_SDK_PATH}") +message("QNN_SDK_PATH : ${QNN_SDK_PATH}") message("HEXAGON_SDK_PATH: ${HEXAGON_SDK_PATH}") +message("HTP_ARCH_VERSION: ${HTP_ARCH_VERSION}") if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) @@ -37,7 +39,6 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") include_directories(${HEXAGON_SDK_PATH}/libs/atomic/android_Debug_aarch64/ship) include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-qnn/) include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-qnn/kernels/) - elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") else() @@ -55,3 +56,20 @@ target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") target_compile_definitions(ggml-qnn PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") + +function(ggml_hexagon_build_kernel KNAME) + message(STATUS "ggml_hexagon: build kernel ${KNAME}") + + add_custom_command( + TARGET ${PROJECT_NAME} + POST_BUILD + COMMAND echo "current working path:`pwd`\n" + COMMAND echo "${CMAKE_CURRENT_LIST_DIR}/kernels" + COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ clean + COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ HEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} HTP_ARCH_VERSION=${HTP_ARCH_VERSION} + COMMAND echo "`pwd`" + COMMENT "build hexagon-kernel" + ) +endfunction() + +ggml_hexagon_build_kernel("cdsp") diff --git a/ggml/src/ggml-qnn/kernels/Makefile b/ggml/src/ggml-qnn/kernels/Makefile index 879bd3444ee1b..0a58333a5f895 100755 --- a/ggml/src/ggml-qnn/kernels/Makefile +++ b/ggml/src/ggml-qnn/kernels/Makefile @@ -1,12 +1,13 @@ -HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 +TARGET=libggmlop_skel${HTP_ARCH_VERSION}.so -TARGET=libggmlop_skel.so +$(info HEXAGON_SDK_PATH:${HEXAGON_SDK_PATH}) +$(info HTP_ARCH_VERSION:${HTP_ARCH_VERSION}) INCS=-I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -CFLAGS=-mv75 -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B ${INCS} +CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B ${INCS} -LDFLAGS=-mv75 -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET} +LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET} SRCS = ggmlop_cdsp.c ggmlop_cdsp_skel.c OBJS = $(patsubst %.c, %.o, $(SRCS)) diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 2e4143b5acca3..ecce07af250cd 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -16,7 +16,15 @@ QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direc QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/ 
QNN_SDK_VERSION=2.32.0.250228 QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION} + HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 +#available htp arch version: +#v68 --- Snapdragon 888 +#v69 --- Snapdragon 8 Gen1 +#v73 --- Snapdragon 8 Gen2 +#v75 --- Snapdragon 8 Gen3 +#v79 --- Snapdragon 8 Elite(aka Gen4) +HTP_ARCH_VERSION=v75 qnnparams=" -mg 2 -ngl 99 " @@ -107,26 +115,14 @@ function check_and_download_ndk() } -function build_dsp -{ - cd ggml/src/ggml-qnn/kernels/ - show_pwd - make clean - make - cd - -} - - function build_arm64 { - cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} + cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} cd out/android make -j16 show_pwd cd - - - build_dsp } @@ -201,7 +197,7 @@ function prepare_run_on_phone() fi adb push ./out/android/bin/${program} ${REMOTE_PATH}/ adb shell chmod +x ${REMOTE_PATH}/${program} - adb push ggml/src/ggml-qnn/kernels/libggmlop_skel.so ${REMOTE_PATH}/ + adb push ggml/src/ggml-qnn/kernels/libggmlop_skel${HTP_ARCH_VERSION}.so ${REMOTE_PATH}/libggmlop_skel.so } function run_llamacli() From 19eb56db7c661f15ed56bd5b3623d2f140934e30 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 29 Mar 2025 16:12:18 +0800 Subject: [PATCH 142/200] ggml-hexagon: release ggml-hexagon v0.98 --- CMakeLists.txt | 2 +- ggml/CMakeLists.txt | 4 +- ggml/include/{ggml-qnn.h => ggml-hexagon.h} | 27 +- ggml/src/CMakeLists.txt | 2 +- ggml/src/ggml-backend-reg.cpp | 10 +- .../{ggml-qnn => ggml-hexagon}/CMakeLists.txt | 14 +- .../ggml-hexagon.cpp} | 8075 ++++++++--------- .../kernels/Makefile | 0 .../kernels/ggmlop_ap_skel.c | 0 .../kernels/ggmlop_ap_skel.h | 0 .../kernels/ggmlop_cdsp.c | 0 .../kernels/ggmlop_cdsp_skel.c | 0 scripts/build-run-android.sh | 42 +- scripts/{ggml-qnn.cfg => ggml-hexagon.cfg} | 8 +- tests/test-backend-ops.cpp | 4 - 15 files changed, 4046 insertions(+), 4142 deletions(-) rename ggml/include/{ggml-qnn.h => ggml-hexagon.h} (62%) rename ggml/src/{ggml-qnn => ggml-hexagon}/CMakeLists.txt (83%) rename ggml/src/{ggml-qnn/ggml-qnn.cpp => ggml-hexagon/ggml-hexagon.cpp} (73%) rename ggml/src/{ggml-qnn => ggml-hexagon}/kernels/Makefile (100%) rename ggml/src/{ggml-qnn => ggml-hexagon}/kernels/ggmlop_ap_skel.c (100%) rename ggml/src/{ggml-qnn => ggml-hexagon}/kernels/ggmlop_ap_skel.h (100%) rename ggml/src/{ggml-qnn => ggml-hexagon}/kernels/ggmlop_cdsp.c (100%) rename ggml/src/{ggml-qnn => ggml-hexagon}/kernels/ggmlop_cdsp_skel.c (100%) rename scripts/{ggml-qnn.cfg => ggml-hexagon.cfg} (86%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2148b436d2afc..c5903c112b944 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -129,7 +129,7 @@ llama_option_depr(WARNING LLAMA_RPC GGML_RPC) llama_option_depr(WARNING LLAMA_SYCL GGML_SYCL) llama_option_depr(WARNING LLAMA_SYCL_F16 GGML_SYCL_F16) llama_option_depr(WARNING LLAMA_CANN GGML_CANN) -llama_option_depr(WARNING LLAMA_QNN GGML_QNN) +llama_option_depr(WARNING LLAMA_HEXAGON GGML_HEXAGON) if (NOT MSVC) if (LLAMA_SANITIZE_THREAD) diff --git 
a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index b12a4fa47c420..31bf928a06a80 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -204,7 +204,7 @@ option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON) set (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING "gmml: OpenCL API version to target") -option(GGML_QNN "ggml: use QNN" OFF) +option(GGML_HEXAGON "ggml: use HEXAGON" OFF) # toolchain for vulkan-shaders-gen set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen") @@ -270,7 +270,7 @@ set(GGML_PUBLIC_HEADERS include/ggml-rpc.h include/ggml-sycl.h include/ggml-vulkan.h - include/ggml-qnn.h + include/ggml-hexagon.h include/gguf.h) set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") diff --git a/ggml/include/ggml-qnn.h b/ggml/include/ggml-hexagon.h similarity index 62% rename from ggml/include/ggml-qnn.h rename to ggml/include/ggml-hexagon.h index 63d136c5e52b9..8e37f7da73adf 100644 --- a/ggml/include/ggml-qnn.h +++ b/ggml/include/ggml-hexagon.h @@ -28,25 +28,26 @@ extern "C" { #endif -#define GGML_QNN_MAX_DEVICES 3 -#define GGML_QNN_BACKEND_NAME "hexagon" - -enum QNNBackend { - QNN_BACKEND_CPU, - QNN_BACKEND_GPU, - QNN_BACKEND_NPU, - QNN_BACKEND_GGML, //"fake" QNN backend for compare performance between QNN backend and cpu backend +#define GGML_HEXAGON_MAX_DEVICES 3 +#define GGML_HEXAGON_BACKEND_NAME "hexagon" + +enum HEXAGONBackend { + HEXAGON_BACKEND_QNNCPU = 0, + HEXAGON_BACKEND_QNNGPU = 1, + HEXAGON_BACKEND_QNNNPU = 2, + HEXAGON_BACKEND_CDSP = 2, + HEXAGON_BACKEND_GGML = 3, //"fake" QNN backend for compare performance between HEXAGON backend and ggml backend }; -GGML_BACKEND_API ggml_backend_t ggml_backend_qnn_init(size_t dev_num, const char * qnn_lib_path); +GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path); -GGML_BACKEND_API bool ggml_backend_is_qnn(ggml_backend_t backend); +GGML_BACKEND_API bool ggml_backend_is_hexagon(ggml_backend_t backend); -GGML_BACKEND_API int ggml_backend_qnn_get_device_count(void); +GGML_BACKEND_API int ggml_backend_hexagon_get_device_count(void); -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_qnn_reg(void); +GGML_BACKEND_API ggml_backend_reg_t ggml_backend_hexagon_reg(void); -const char * ggml_backend_qnn_get_devname(size_t dev_num); +const char * ggml_backend_hexagon_get_devname(size_t dev_num); #ifdef __cplusplus } diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 8e8cb81bda0a7..5822ddcc2340a 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -313,7 +313,7 @@ ggml_add_backend(RPC) ggml_add_backend(SYCL) ggml_add_backend(Vulkan) ggml_add_backend(OpenCL) -ggml_add_backend(QNN) +ggml_add_backend(HEXAGON) foreach (target ggml-base ggml) target_include_directories(${target} PUBLIC $ $) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 9030de3cfeef9..e2e334c1de002 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -65,8 +65,8 @@ #include "ggml-kompute.h" #endif -#ifdef GGML_USE_QNN -#include "ggml-qnn.h" +#ifdef GGML_USE_HEXAGON +#include "ggml-hexagon.h" #endif // disable C++17 deprecation warning for std::codecvt_utf8 @@ -191,8 +191,8 @@ struct ggml_backend_registry { #ifdef GGML_USE_KOMPUTE register_backend(ggml_backend_kompute_reg()); #endif -#ifdef GGML_USE_QNN - register_backend(ggml_backend_qnn_reg()); +#ifdef GGML_USE_HEXAGON + 
register_backend(ggml_backend_hexagon_reg()); #endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); @@ -584,7 +584,7 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("vulkan", silent, dir_path); ggml_backend_load_best("opencl", silent, dir_path); ggml_backend_load_best("musa", silent, dir_path); - ggml_backend_load_best("qnn", silent, dir_path); + ggml_backend_load_best("hexagon", silent, dir_path); ggml_backend_load_best("cpu", silent, dir_path); // check the environment variable GGML_BACKEND_PATH to load an out-of-tree backend const char * backend_path = std::getenv("GGML_BACKEND_PATH"); diff --git a/ggml/src/ggml-qnn/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt similarity index 83% rename from ggml/src/ggml-qnn/CMakeLists.txt rename to ggml/src/ggml-hexagon/CMakeLists.txt index ed6fb2fd85608..7daedaa755c78 100644 --- a/ggml/src/ggml-qnn/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -1,4 +1,4 @@ -project(ggml-qnn) +project(ggml-hexagon) message(STATUS "Using HEXAGON backend") message("CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}") @@ -37,8 +37,8 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") include_directories(${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/android_aarch64) include_directories(${HEXAGON_SDK_PATH}/libs/atomic/inc) include_directories(${HEXAGON_SDK_PATH}/libs/atomic/android_Debug_aarch64/ship) - include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-qnn/) - include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-qnn/kernels/) + include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-hexagon/) + include_directories(${CMAKE_SOURCE_DIR}/ggml/src/ggml-hexagon/kernels/) elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") else() @@ -49,13 +49,13 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_QNN") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/ggmlop_ap_skel.c") -ggml_add_backend_library(ggml-qnn ${QNN_SOURCES}) +ggml_add_backend_library(ggml-hexagon ${QNN_SOURCES}) -target_include_directories(ggml-qnn PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR}) -target_link_libraries(ggml-qnn PRIVATE ${QNN_LINK_LIBRARIES}) +target_include_directories(ggml-hexagon PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR}) +target_link_libraries(ggml-hexagon PRIVATE ${QNN_LINK_LIBRARIES}) string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") -target_compile_definitions(ggml-qnn PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") +target_compile_definitions(ggml-hexagon PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") function(ggml_hexagon_build_kernel KNAME) message(STATUS "ggml_hexagon: build kernel ${KNAME}") diff --git a/ggml/src/ggml-qnn/ggml-qnn.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp similarity index 73% rename from ggml/src/ggml-qnn/ggml-qnn.cpp rename to ggml/src/ggml-hexagon/ggml-hexagon.cpp index b89b283552acf..b52bf5a0418ce 100644 --- a/ggml/src/ggml-qnn/ggml-qnn.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -3,18 +3,18 @@ * * Qualcomm QNN SDK and reference tech guides could be found at: * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk + * Qualcomm Hexagon SDK and reference tech guides could be found at: * 
https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools * - * this single-source-file or self-contained implementation of ggml-hexagon backend has 9 sections: + * this single-source-file or self-contained implementation of ggml-hexagon backend has 8 sections: * section-1 forward/prototype declaration, global vars, macros, data structures * section-2 ggml-qnn internal troubleshooting function/class * section-3 helper function for WoA(Windows on ARM) * section-4 general helper function * section-5 QNN helper function - * section-6 Hexagon DSP helper function - * section-7 backend helper function / class + * section-6 implementation of hwaccel approach through QNN: offload ggmlop to QNN + * section-7 cDSP helper function * section-8 implementation of ggml-hexagon backend according to specification in ggml backend subsystem - * section-9 implementation of hwaccel approach through QNN and Hexagon DSP * * currently provide following ggml op' implementation through QNN: * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV/GGML_OP_LOG/GGML_OP_SQRT: @@ -22,7 +22,7 @@ * - GGML_OP_MUL_MAT: * this is a complicated hwaccel skeleton, can expand other ggml ops accordingly * - * currently provide following ggml op' implementation through Hexagon DSP: + * currently provide following ggml op' implementation through cDSP in hexagon-kernels: * - GGML_OP_ADD & GGML_OP_MUL_MAT: * this is a hwaccel skeleton, can expand other ggml ops accordingly * @@ -117,7 +117,7 @@ #include "HTP/QnnHtpDevice.h" #include "HTP/QnnHtpGraph.h" -#include "ggml-qnn.h" +#include "ggml-hexagon.h" #include "ggml-impl.h" #include "ggml-backend-impl.h" @@ -127,113 +127,47 @@ // section-1: forward/prototype declaration, global vars, macros, data structures // ================================================================================================= class qnn_instance; -struct qnn_parameter; -struct ggml_backend_qnn_context; - -static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void 
ggmlqnn_compute_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst); -static void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value); - -static size_t ggmlqnn_get_op_index(const ggml_tensor * tensor); -static const char * ggmlqnn_get_hwaccel_approach_name(int hwaccle_approach); -static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op); -static void ggmlqnn_get_opkey_from_op(const ggml_tensor * op, std::string & output); +struct ggml_backend_hexagon_context; #if 0//def NDEBUG -#define GGMLQNN_DEBUG 0 +#define GGMLHEXAGON_DEBUG 0 #else -#define GGMLQNN_DEBUG 1 +#define GGMLHEXAGON_DEBUG 1 #endif -#define GGML_QNN_LOGBUF_LEN 4096 -#define GGML_QNN_TMPBUF_LEN 256 +#define GGMLHEXAGON_LOGBUF_LEN 4096 +#define GGMLHEXAGON_TMPBUF_LEN 256 -#define GGMLQNN_LOG_ERROR(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_WARN(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#define GGMLQNN_LOG_INFO(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#if GGMLQNN_DEBUG -#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#if GGMLHEXAGON_DEBUG +#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else -#define GGMLQNN_LOG_DEBUG(...) +#define GGMLHEXAGON_LOG_DEBUG(...) 
#endif -#define GGMLQNN_DUMP_TENSOR(tensor) ggmlqnn_dump_tensor(tensor, #tensor) #define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 #define STATUS_CONTEXT 0x12345678 -#define QNN_TENSOR_GET_ID(tensor) get_qnn_tensorid(tensor) -#define QNN_TENSOR_GET_NAME(tensor) get_qnn_tensorname(tensor) -#define QNN_TENSOR_GET_TYPE(tensor) get_qnn_tensortype(tensor) -#define QNN_TENSOR_GET_DATA_FORMAT(tensor) get_qnn_tensor_dataformat(tensor) -#define QNN_TENSOR_GET_DATA_TYPE(tensor) get_qnn_tensor_datatype(tensor) -#define QNN_TENSOR_GET_QUANT_PARAMS(tensor) get_qnn_tensor_quantparams(tensor) -#define QNN_TENSOR_GET_RANK(tensor) get_qnn_tensor_rank(tensor) -#define QNN_TENSOR_GET_DIMENSIONS(tensor) get_qnn_tensor_dimensions(tensor) -#define QNN_TENSOR_GET_MEM_TYPE(tensor) get_qnn_tensor_memtype(tensor) -#define QNN_TENSOR_GET_CLIENT_BUF(tensor) get_qnn_tensor_clientbuf(tensor) -#define QNN_TENSOR_GET_MEM_HANDLE(tensor) get_qnn_tensor_memhandle(tensor) - -#define QNN_TENSOR_SET_ID(tensor, value) set_qnn_tensor_id(tensor, value) -#define QNN_TENSOR_SET_NAME(tensor, value) set_qnn_tensor_name(tensor, value) -#define QNN_TENSOR_SET_TYPE(tensor, value) set_qnn_tensor_type(tensor, value) -#define QNN_TENSOR_SET_DATA_FORMAT(tensor, value) set_qnn_tensor_dataformat(tensor, value) -#define QNN_TENSOR_SET_DATA_TYPE(tensor, value) set_qnn_tensor_datatype(tensor, value) -#define QNN_TENSOR_SET_QUANT_PARAMS(tensor, value) set_qnn_tensor_quantparams(tensor, value) -#define QNN_TENSOR_SET_RANK(tensor, value) set_qnn_tensor_rank(tensor, value) -#define QNN_TENSOR_SET_DIMENSIONS(tensor, value) set_qnn_tensor_dimensions(tensor, value) -#define QNN_TENSOR_SET_MEM_TYPE(tensor, value) set_qnn_tensor_memtype(tensor, value) -#define QNN_TENSOR_SET_CLIENT_BUF(tensor, value) set_qnn_tensor_clientbuf(tensor, value) -#define QNN_TENSOR_SET_MEM_HANDLE(tensor, value) set_qnn_tensor_memhandle(tensor, value) - -#define DISABLE_COPY(class_name) \ - class_name(const class_name &) = delete; \ - void operator=(const class_name &) = delete - -#define DISABLE_MOVE(class_name) \ - class_name(class_name &&) = delete; \ - void operator=(class_name &&) = delete - #define CHECK_QNN_API(error, result) \ do { \ error = (result); \ if (QNN_SUCCESS != error) { \ if (error == QNN_COMMON_ERROR_NOT_SUPPORTED) { \ - GGMLQNN_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ + GGMLHEXAGON_LOG_WARN("WARNING: QNN feature/API not supported\n"); \ } else { \ - GGMLQNN_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_qnnerror_string(error)); \ + GGMLHEXAGON_LOG_INFO("QNN API error = %d(%s)\n", error, ggmlqnn_get_qnnerror_string(error)); \ } \ } \ } while (0) #define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \ do { \ - if (g_qnn_params.hwaccel_approach != HWACCEL_CDSP) { \ + if (g_hexagon_appcfg.hwaccel_approach != HWACCEL_CDSP) { \ if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \ return; \ } \ @@ -257,7 +191,7 @@ using qnn_tensors_t = std::vector< Qnn_Tensor_t >; using qnn_ptensors_t = std::vector< Qnn_Tensor_t *>; using qnn_singlenode_res_t = std::tuple; -typedef void (* ggmlqnn_op_func_t)(ggml_backend_qnn_context * ctx, ggml_tensor * op); +typedef void (* ggmlqnn_op_func_t)(ggml_backend_hexagon_context * ctx, ggml_tensor * op); typedef int (* notify_callback_fn)(void * context, int domain, int session, remote_rpc_status_flags_t status); typedef int (* ggmlhexagon_op_func_t)(remote_handle64 handle, const dsptensor * src0, const dsptensor * src1, dsptensor * dst); @@ -321,7 +255,7 @@ 
struct qcom_socinfo { char soc_desc[GGML_MAX_NAME]; }; -struct ggml_backend_qnn_context { +struct ggml_backend_hexagon_context { int device; char name[GGML_MAX_NAME]; char desc[GGML_MAX_NAME]; @@ -349,6 +283,26 @@ struct ggml_backend_qnn_context { int domain_id; }; +struct ggml_backend_hexagon_buffer_context { + ~ggml_backend_hexagon_buffer_context() { + if (buffer) { + ggml_aligned_free(buffer, 0); + } + + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + sub_buffers.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_hexagon_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; +}; + struct qnn_op_caps { bool supported; ggml_op op; @@ -364,7 +318,7 @@ struct hexagon_op_caps { ggmlhexagon_op_func_t dsp_op_func; }; -struct qnn_parameter { +struct hexagon_appcfg_t { int print_qnn_internal_log; // enable/disable QNN's internal log int enable_perf; // enable/disable perf of op function int print_tensors_info; // enable/disable print tensors info in op function @@ -374,17 +328,14 @@ struct qnn_parameter { int vtcm_size_in_mb; int enable_dlbc; int hwaccel_approach; // 0: HWACCEL_QNN 1: HWACCEL_QNN_SINGLEGRAPH 2: HWACCEL_CDSP - int qnn_backend; // 0: QNN-CPU backend 1: QNN-GPU backend 2: QNN-NPU backend + int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP int enable_mulmat_cdsp; // enable/disable offload mulmat to cDSP int enable_q_mulmat; // enable/disable offload fp32 & all quantized type mulmat to cDSP - const char * qnn_cfgfilename; - const char * qnn_runtimelib_path; + const char * cfgfilename; + const char * runtimelib_path; }; -static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique -static int32_t g_qnnopcfg_idx = 0; //ensure every QNN opconfig name is unique - -static struct qnn_parameter g_qnn_params = { +static struct hexagon_appcfg_t g_hexagon_appcfg = { .print_qnn_internal_log = 0, .enable_perf = 0, .print_tensors_info = 0, @@ -394,13 +345,13 @@ static struct qnn_parameter g_qnn_params = { .vtcm_size_in_mb = 8, .enable_dlbc = 1, .hwaccel_approach = HWACCEL_CDSP, - .qnn_backend = QNN_BACKEND_NPU, + .hexagon_backend = HEXAGON_BACKEND_CDSP, .enable_mulmat_cdsp = 0, .enable_q_mulmat = 0, - .qnn_cfgfilename = "ggml-qnn.cfg", + .cfgfilename = "ggml-hexagon.cfg", #if defined(__ANDROID__) //Android command line program - .qnn_runtimelib_path = "/data/local/tmp/", + .runtimelib_path = "/data/local/tmp/", #elif defined(__linux__) .qnn_runtimelib_path = "/tmp/", #elif defined(_WIN32) @@ -490,8 +441,8 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { // HTP - Choose a quantized model. Quantized models are required when running on the HTP backend // DSP - Choose a quantized model. Quantized models are required when running on the DSP backend // HTA - Choose a quantized model. 
Quantized models are required when running on the HTA backend -static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { - [QNN_BACKEND_CPU] = {.device = 0, +static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICES] = { + [HEXAGON_BACKEND_QNNCPU] = {.device = 0, .name = "qnn-cpu", .desc = "Qualcomm Kryo CPU", #if !defined(__ANDROID__) && !defined(__linux__) @@ -505,7 +456,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .raw_system_interface = {}, .socinfo = {}}, - [QNN_BACKEND_GPU] = {.device = 1, + [HEXAGON_BACKEND_QNNGPU] = {.device = 1, .name = "qnn-gpu", .desc = "Qualcomm Adreno GPU", #if !defined(__ANDROID__) && !defined(__linux__) @@ -519,7 +470,7 @@ static struct ggml_backend_qnn_context g_qnn_mgr[GGML_QNN_MAX_DEVICES] = { .raw_system_interface = {}, .socinfo = {}}, - [QNN_BACKEND_NPU] = {.device = 2, + [HEXAGON_BACKEND_QNNNPU] = {.device = 2, .name = "qnn-npu", .desc = "Qualcomm NPU(Hexagon Tensor Processor)", #if !defined(__ANDROID__) && !defined(__linux__) @@ -663,8 +614,8 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {true, GGML_OP_MUL, 2, "ggmlop_dsp_mul", ggmlop_dsp_mul}, {true, GGML_OP_DIV, 2, "ggmlop_dsp_div", ggmlop_dsp_div}, {false, GGML_OP_SQR}, - {false, GGML_OP_SQRT}, - {false, GGML_OP_LOG}, + {false, GGML_OP_SQRT, 1}, + {false, GGML_OP_LOG, 1}, {false, GGML_OP_SIN}, {false, GGML_OP_COS}, {false, GGML_OP_SUM}, @@ -762,49 +713,52 @@ static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT static_assert(std::size(ggmlhexagon_k_op_caps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), "pls check ggmlhexagon_k_op_caps and ensure is corresponding to latest ggml.h"); +static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique +static int32_t g_qnnopcfg_idx = 0; //ensure every QNN opconfig name is unique + // ================================================================================================= -// section-2: ggml-qnn internal troubleshooting function/class +// section-2: ggml-hexagon internal troubleshooting function/class // ================================================================================================= -static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { - static std::mutex ggmlqnn_log_internal_mutex; - static char s_ggmlqnn_log_internal_buf[GGML_QNN_LOGBUF_LEN]; +static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) 
{ + static std::mutex ggmlhexagon_log_internal_mutex; + static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; GGML_UNUSED(file); #if !(defined __ANDROID__) || !(defined ANDROID) GGML_UNUSED(level); #endif { - std::lock_guard lock(ggmlqnn_log_internal_mutex); + std::lock_guard lock(ggmlhexagon_log_internal_mutex); va_list args; va_start(args, format); - int len_prefix = snprintf(s_ggmlqnn_log_internal_buf, GGML_QNN_LOGBUF_LEN, "[%s, %d]: ", func, line); - int len = vsnprintf(s_ggmlqnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args); - if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) { + int len_prefix = snprintf(s_ggmlhexagon_log_internal_buf, GGMLHEXAGON_LOGBUF_LEN, "[%s, %d]: ", func, line); + int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args); + if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) { #if (defined __ANDROID__) || (defined ANDROID) - __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf); + __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlhexagon_log_internal_buf); if (GGML_LOG_LEVEL_INFO == level) { - printf("%s\n", s_ggmlqnn_log_internal_buf); + printf("%s\n", s_ggmlhexagon_log_internal_buf); } #else //for Snapdragon based WoA(Windows on ARM) device or Linux - printf("%s\n", s_ggmlqnn_log_internal_buf); + printf("%s\n", s_ggmlhexagon_log_internal_buf); #endif } va_end(args); } } -static void ggmlqnn_print_tensors_info(const char * func_name, const ggml_backend_qnn_context * ctx, +static void ggmlhexagon_print_tensors_info(const char * func_name, const ggml_backend_hexagon_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) { //skip sanity check of params because of performance concern - if (0 == g_qnn_params.print_tensors_info) + if (0 == g_hexagon_appcfg.print_tensors_info) return; if (nullptr != func_name && nullptr != ctx) { - GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); + GGMLHEXAGON_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); } if (nullptr != src0) { - GGMLQNN_LOG_DEBUG( + GGMLHEXAGON_LOG_DEBUG( "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", src0->name, src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], @@ -812,33 +766,33 @@ static void ggmlqnn_print_tensors_info(const char * func_name, const ggml_backen src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); } if (nullptr != src1) { - GGMLQNN_LOG_DEBUG( + GGMLHEXAGON_LOG_DEBUG( "%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", src1->name, src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); } - GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", + GGMLHEXAGON_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)", dst->name, dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]); - GGMLQNN_LOG_DEBUG("\n"); + GGMLHEXAGON_LOG_DEBUG("\n"); } -static void ggmlqnn_dump_op_info(const struct ggml_tensor * tensor) { +static void ggmlhexagon_dump_op_info(const struct ggml_tensor * tensor) { //skip sanity check of params because 
of performance concern - if (0 == g_qnn_params.dump_op_info) + if (0 == g_hexagon_appcfg.dump_op_info) return; const struct ggml_tensor * src0 = tensor->src[0]; struct ggml_tensor * src1 = tensor->src[1]; struct ggml_tensor * dst = const_cast(tensor); - GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); - ggmlqnn_print_tensors_info(nullptr, nullptr, src0, src1, dst); + GGMLHEXAGON_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type)); + ggmlhexagon_print_tensors_info(nullptr, nullptr, src0, src1, dst); } -static void ggmlqnn_dump_tensor_elements(const ggml_tensor * tensor) { +static void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { float value = 0; std::ostringstream tmposs; if (tensor->type == GGML_TYPE_F32) { @@ -851,8 +805,8 @@ static void ggmlqnn_dump_tensor_elements(const ggml_tensor * tensor) { tmposs << std::setw(8) << std::fixed << std::setprecision(2) << value << " "; } - if (strlen(tmposs.str().c_str()) <= (GGML_QNN_LOGBUF_LEN - 96)) { - GGMLQNN_LOG_DEBUG("%s\n", tmposs.str().c_str()); + if (strlen(tmposs.str().c_str()) <= (GGMLHEXAGON_LOGBUF_LEN - 96)) { + GGMLHEXAGON_LOG_DEBUG("%s\n", tmposs.str().c_str()); } tmposs.clear(); tmposs.str(""); @@ -861,22 +815,36 @@ static void ggmlqnn_dump_tensor_elements(const ggml_tensor * tensor) { } } - GGMLQNN_LOG_DEBUG("\n"); + GGMLHEXAGON_LOG_DEBUG("\n"); } -static void ggmlqnn_dump_tensor(const ggml_tensor * tensor, const char * name) { - GGMLQNN_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); - GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", +static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, const char * name) { + GGMLHEXAGON_LOG_DEBUG("dump ggml tensor %s(%s)\n", name, tensor->name); + GGMLHEXAGON_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64", nb = (%5zi, %5zi, %5zi, %5zi)\n", name, tensor->type, ggml_type_name(tensor->type), tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[2]); - ggmlqnn_dump_tensor_elements(tensor); + ggmlhexagon_dump_tensor_elements(tensor); - GGMLQNN_LOG_DEBUG("\n"); + GGMLHEXAGON_LOG_DEBUG("\n"); +} + +static const char * ggmlhexagon_get_hwaccel_approach_name(int hwaccle_approach) { + switch (hwaccle_approach) { + case HWACCEL_QNN: + return "HWACCEL_QNN"; + case HWACCEL_QNN_SINGLEGRAPH: + return "HWACCEL_QNN_SINGLEGRAPH"; + case HWACCEL_CDSP: + return "HWACCEL_CDSP"; + default: + return "unknown hwaccel approach"; + } } -static void ggmlqnn_get_timestring(char * p_currenttime) { +static void ggmlhexagon_get_timestring(char * p_currenttime) { +#if defined(__ANDROID__) || defined(__linux__) time_t n_seconds = 0; struct tm * p_tm = nullptr; @@ -885,43 +853,47 @@ static void ggmlqnn_get_timestring(char * p_currenttime) { time(&n_seconds); p_tm = localtime(&n_seconds); - snprintf(p_currenttime, GGML_QNN_TMPBUF_LEN, "%04d-%02d-%02d,%02d:%02d:%02d", + snprintf(p_currenttime, GGMLHEXAGON_TMPBUF_LEN, "%04d-%02d-%02d,%02d:%02d:%02d", p_tm->tm_year + 1900, p_tm->tm_mon + 1, p_tm->tm_mday, p_tm->tm_hour, p_tm->tm_min, p_tm->tm_sec); +#else + //TODO: WoA +#endif } -static void ggmlqnn_print_running_timestamp(ggml_backend_qnn_context * ctx) { - GGMLQNN_LOG_INFO("hwaccel approach is %d(%s)", g_qnn_params.hwaccel_approach, - ggmlqnn_get_hwaccel_approach_name(g_qnn_params.hwaccel_approach)); - 
char timestamp[GGML_QNN_TMPBUF_LEN]; - ggmlqnn_get_timestring(timestamp); - if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { - GGMLQNN_LOG_INFO("only offload GGML_OP_ADD : %s", g_qnn_params.enable_q_mulmat ? "NO" : "YES"); +static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * ctx) { + GGMLHEXAGON_LOG_INFO("hwaccel approach is %d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + char timestamp[GGMLHEXAGON_TMPBUF_LEN]; + memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); + ggmlhexagon_get_timestring(timestamp); + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_INFO("only offload GGML_OP_ADD : %s", g_hexagon_appcfg.enable_q_mulmat ? "NO" : "YES"); } else { - GGMLQNN_LOG_INFO("only offload GGML_OP_ADD: NO"); + GGMLHEXAGON_LOG_INFO("only offload GGML_OP_ADD: NO"); } - GGMLQNN_LOG_INFO("running timestamp:%s", timestamp); + GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); } -class qnn_perf { +class hexagon_perf { public: - qnn_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; - qnn_perf() = delete; - qnn_perf(const qnn_perf & ) = delete; - qnn_perf & operator= (const qnn_perf & ) = delete; + hexagon_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + hexagon_perf() = delete; + hexagon_perf(const hexagon_perf & ) = delete; + hexagon_perf & operator= (const hexagon_perf & ) = delete; void start() { - if (0 == g_qnn_params.enable_perf) + if (0 == g_hexagon_appcfg.enable_perf) return; _begin_time = ggml_time_us(); } void info() { - if (0 == g_qnn_params.enable_perf) + if (0 == g_hexagon_appcfg.enable_perf) return; _end_time = ggml_time_us(); _duration = (_end_time - _begin_time); - GGMLQNN_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + GGMLHEXAGON_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } private: @@ -931,6 +903,146 @@ class qnn_perf { std::string _perf_name; }; +class hexagon_appcfg { +public: + void dump(std::function worker) { + if (!_load_success) { + GGMLHEXAGON_LOG_INFO("qnn cfg file %s not loaded", _cfg_filename.c_str()); + return; + } + auto iter = _hexagon_appcfg.begin(); + while (iter != _hexagon_appcfg.end()) { + auto kv_iter = iter->second.begin(); + while (kv_iter != iter->second.end()) { + worker(iter->first, kv_iter->first, kv_iter->second); + ++kv_iter; + } + ++iter; + } + } + + bool load(const std::string & file_name) { + if (file_name == "") { + return false; + } + _cfg_filename = file_name; + std::ifstream in; + std::string line; + in.open(file_name.c_str()); + if (not in.is_open()) { + GGMLHEXAGON_LOG_WARN("can't open file %s", file_name.c_str()); + return false; + } + while (getline(in, line)) { + std::string section, key, value; + if (not parse_line(line, section, key, value)) { + continue; + } + set_section_keyvalue(section, key, value); + } + _load_success = true; + return true; + } + + void get_stringvalue(const std::string & section, const std::string & key, std::string & value, std::string default_value) { + value = default_value; + if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) { + return; + } + if (_hexagon_appcfg[section].find(key) == _hexagon_appcfg[section].end()) { + return; + } + value = _hexagon_appcfg[section][key]; + } + + void get_intvalue(const std::string & section, const std::string & key, int & value, int default_value) { + value = default_value; + if (_hexagon_appcfg.find(section) == 
_hexagon_appcfg.end()) { + return; + } + if (_hexagon_appcfg[section].find(key) == _hexagon_appcfg[section].end()) { + return; + } + value = atol(_hexagon_appcfg[section][key].c_str()); + } + +private: + void ltrim(std::string & str) { + if (str.empty()) return; + size_t len = 0; + char* temp = (char*)str.c_str(); + while (*temp && isblank(*temp)) { + ++len; + ++temp; + } + if (len > 0) str.erase(0, len); + } + + void rtrim(std::string & str) { + if (str.empty()) return; + size_t len = str.length(); + size_t pos = len; + while (pos > 0) { + if (not isblank(str[pos - 1])) { + break; + } + --pos; + } + if (pos != len) str.erase(pos); + } + + void trim(std::string& str) { + ltrim(str); + rtrim(str); + } + + void set_section_keyvalue(std::string & section, std::string & key, std::string & value) { + if (_hexagon_appcfg.find(section) == _hexagon_appcfg.end()) { + std::unordered_map kv_map; + _hexagon_appcfg[section] = kv_map; + } + if (key != "" && value != "") _hexagon_appcfg[section][key] = value; + } + + bool parse_line(std::string & line, std::string & section, std::string & key, std::string & value) { + static std::string cur_section = ""; + std::string nodes[2] = {"#", ";"}; + for (int i = 0; i < 2; ++i) { + std::string::size_type pos = line.find(nodes[i]); + if (pos != std::string::npos) line.erase(pos); + } + trim(line); + if (line == "") return false; + if (line[0] == '[' && line[line.size() - 1] == ']') { + section = line.substr(1, line.size() - 2); + trim(section); + cur_section = section; + return false; + } + if (cur_section == "") return false; + bool is_key = true; + for (size_t i = 0; i < line.size(); ++i) { + if (line[i] == '=') { + is_key = false; + continue; + } + if (is_key) { + key += line[i]; + } else { + value += line[i]; + } + } + section = cur_section; + trim(key); + trim(value); + return true; + } +private: + std::unordered_map> _hexagon_appcfg; + bool _load_success = false; + std::string _cfg_filename; +}; + // ================================================================================================= // section-3: helper function for WoA(Window on ARM) // ================================================================================================= @@ -989,67 +1101,124 @@ static const char * dlerror(void) { // ================================================================================================= // section-4: general helper function // ================================================================================================= -//ensure every QNN tensor/opcfg name is unique, threadsafe is not required at the moment -static void ggmlqnn_reset_idx() { - g_qnntensor_idx = 0; - g_qnnopcfg_idx = 0; -} - -static void ggmlqnn_inc_idx(int idx_type) { - switch (idx_type) { - case QNN_TENSOR_INDEX: - g_qnntensor_idx++; - break; - case QNN_OPCFG_INDEX: - g_qnnopcfg_idx++; - break; +static const char * ggmlhexagon_get_socmodel_desc(uint32_t soc_model) { + switch (soc_model) { + case SM7450: + return "SM7450"; + case SM8350: + return "SM8350"; + case SM8450: + return "SM8450"; + case SM8475: + return "SM8475"; + case SM8550: + return "SM8550"; + case SM8650: + return "SM8650"; + case SM8750: + return "SM8750"; default: - break; + return "unknown"; } } -static int32_t ggmlqnn_get_idx(int idx_type) { - switch (idx_type) { - case QNN_TENSOR_INDEX: - return g_qnntensor_idx; - case QNN_OPCFG_INDEX: - return g_qnnopcfg_idx; +static size_t ggmlhexagon_htparch_hex_to_decimal(size_t htp_arch) { + //naive algorithm + int a = htp_arch / 16; + int b = htp_arch % 16; 
+ return a * 10 + b; +} + +static const char * ggmlhexagon_get_htparch_desc(size_t htp_arch) { + switch (htp_arch) { + case V68: + return "QCOM_HTP_V68"; + case V69: + return "QCOM_HTP_V69"; + case V73: + return "QCOM_HTP_V73"; + case V75: + return "QCOM_HTP_V75"; + case V79: + return "QCOM_HTP_V79"; default: - break; + return "unknown"; } } -static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { - return offset % alignment == 0 ? offset - : offset + - (static_cast(alignment) - - offset % static_cast(alignment)); +static struct qcom_socinfo * ggmlhexagon_get_socinfo_from_socmodel(uint32_t soc_model) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (soc_model == g_qnn_soc_info_table[idx].soc_model) { + return &g_qnn_soc_info_table[idx]; + } + } + return nullptr; } -static size_t ggmlqnn_get_system_total_memory_in_bytes() { -#if defined(__ANDROID__) || defined(__linux__) - struct sysinfo info = {}; - if (0 == sysinfo(&info)) { - return (info.totalram + info.totalswap) * info.mem_unit; +static struct qcom_socinfo * ggmlhexagon_get_socinfo_from_socmodel(size_t htp_arch) { + size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); + for (size_t idx = 0; idx < items; idx++) { + if (htp_arch == g_qnn_soc_info_table[idx].htp_arch) { + return &g_qnn_soc_info_table[idx]; + } } - size_t pages = (size_t)sysconf(_SC_PHYS_PAGES); - size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + return nullptr; +} - return pages * page_size; -#else - //FIXME: Snapdragon based WoA(Windows on ARM) - MEMORYSTATUSEX statex; - statex.dwLength = sizeof(statex); - if (GlobalMemoryStatusEx(&statex)) { - GGMLQNN_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); - GGMLQNN_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); - return statex.ullTotalPhys; +static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { + /* + size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); + size_t n_dims = ggml_get_tensor_rank(tensor); + for (int i = 1; i < n_dims; i++) { + data_size *= tensor->ne[i]; } - return 0; -#endif -} -static size_t ggmlqnn_get_system_free_memory_in_bytes() { + return data_size; + */ + return ggml_nbytes(tensor); +} + +static inline bool ggmlqnn_is_valid_params(ggml_backend_hexagon_context * ctx, const ggml_tensor * src0, + const ggml_tensor * src1, ggml_tensor * dst) { + if ((nullptr == ctx) || (nullptr == src0) || (nullptr == dst)) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return false; + } + + qnn_instance * instance = ctx->instance; + if (nullptr == instance) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); + return false; + } + + return true; +} + +static size_t ggmlhexagon_get_system_total_memory_in_bytes() { +#if defined(__ANDROID__) || defined(__linux__) + struct sysinfo info = {}; + if (0 == sysinfo(&info)) { + return (info.totalram + info.totalswap) * info.mem_unit; + } + size_t pages = (size_t)sysconf(_SC_PHYS_PAGES); + size_t page_size = (size_t)sysconf(_SC_PAGE_SIZE); + + return pages * page_size; +#else + //FIXME: Snapdragon based WoA(Windows on ARM) + MEMORYSTATUSEX statex; + statex.dwLength = sizeof(statex); + if (GlobalMemoryStatusEx(&statex)) { + GGMLHEXAGON_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLHEXAGON_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + return statex.ullTotalPhys; + } + return 0; +#endif +} + +static size_t 
ggmlhexagon_get_system_free_memory_in_bytes() { #if defined(__ANDROID__) || defined(__linux__) struct sysinfo info = {}; if (0 == sysinfo(&info)) { @@ -1064,14 +1233,280 @@ static size_t ggmlqnn_get_system_free_memory_in_bytes() { MEMORYSTATUSEX statex; statex.dwLength = sizeof(statex); if (GlobalMemoryStatusEx(&statex)) { - GGMLQNN_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); - GGMLQNN_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); + GGMLHEXAGON_LOG_INFO("total physical mem:%llu Mb", statex.ullTotalPhys >> 20); + GGMLHEXAGON_LOG_INFO("avail physical mem:%llu Mb", statex.ullAvailPhys >> 20); return statex.ullAvailPhys; } return 0; #endif } +static bool ggmlhexagon_same_types(const ggml_backend_hexagon_context * ctx, const ggml_tensor * op_tensor) { + GGML_UNUSED(ctx); + ggml_tensor * src0 = op_tensor->src[0]; + ggml_tensor * src1 = op_tensor->src[1]; + if (nullptr != src1) { + if (src0->type != op_tensor->type || src1->type != op_tensor->type) { + return false; + } + } else { + if (src0->type != op_tensor->type) { + return false; + } + } + + if (src0->type != GGML_TYPE_F32) + return false; + + return true; +} + +static const char * ggmlhexagon_get_ggml_type_name(ggml_type type) { + const auto * traits = ggml_get_type_traits(type); + return traits->type_name; +} + +static void ggmlhexagon_append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { + char buffer[GGMLHEXAGON_TMPBUF_LEN] = {}; + const char * type_name = ggmlhexagon_get_ggml_type_name(tensor->type); + int len = 0; + switch (ggml_n_dims(tensor)) { + case 1: + len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name); + break; + case 2: + len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); + break; + case 3: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], type_name); + break; + case 4: + default: + len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], + (long)tensor->ne[2], (long)tensor->ne[3], type_name); + break; + } + GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); + output.append(buffer, len); +} + +static size_t ggmlhexagon_get_op_index(const ggml_tensor * tensor) { + if (tensor->op == GGML_OP_UNARY) { + return static_cast(GGML_OP_COUNT) + static_cast(ggml_get_unary_op(tensor)); + } + + return tensor->op; +} + +static size_t ggmlhexagon_get_op_input_param_count(const ggml_tensor * op) { + auto op_index = ggmlhexagon_get_op_index(op); + GGML_ASSERT(op_index < std::size(ggmlqnn_k_op_caps)); + return ggmlhexagon_k_op_caps[op_index].input_param_count; +} + +static void ggmlhexagon_get_opkey_from_op(const ggml_tensor * op, std::string & output) { + GGML_ASSERT(op->op != GGML_OP_NONE); + output += ggml_op_desc(op); + output += ggmlhexagon_get_ggml_type_name(op->type); + size_t param_count = ggmlhexagon_get_op_input_param_count(op); + for (size_t i = 0; i < param_count; ++i) { + auto * input = op->src[i]; + if (!input) { + break; + } + output += '_'; + ggmlhexagon_append_tensor_dimensions(input, output); + } +} + +static void * ggmlhexagon_type_trait(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; + const enum ggml_type src0_type = src0->type; + + GGML_TENSOR_BINARY_OP_LOCALS + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + 
GGML_ASSERT(ne3 == ne13); + GGML_ASSERT(nb00 == ggml_type_size(src0_type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + const int64_t ne_plane = ne01 * ne00; + const size_t desired_size = ((GGML_TYPE_F32 == src0_type) ? 0 : ne03 * ne02 * ne_plane * sizeof(float)); + ctx->desired_size = desired_size; + if (ctx->work_size < desired_size) { + ctx->work_data.reset(new char[desired_size]); + ctx->work_size = desired_size; + } + ctx->n_threads = std::thread::hardware_concurrency(); + void * wdata = ctx->work_data.get(); + // convert src0 to float + if (src0_type != GGML_TYPE_F32) { + const auto * type_traits = ggml_get_type_traits(src0_type); + ggml_to_float_t const to_float = type_traits->to_float; + + for (int64_t i03 = 0; i03 < ne03; i03++) { + for (int64_t i02 = 0; i02 < ne02; i02++) { + const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; + float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; + + const int min_cols_per_thread = 4096; + const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); + const int n_threads = std::max( + std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); + for (int i = 1; i < n_threads; i++) { + const int64_t start = i * ne01 / n_threads; + const int64_t end = (i + 1) * ne01 / n_threads; + if (start < end) { + ctx->tasks.push_back(std::async(std::launch::async, [=]() { + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); + } + })); + } + } + { + // reuse the current thread for the first task + const int64_t start = 0; + const int64_t end = ne01 / n_threads; + for (int64_t i01 = start; i01 < end; i01++) { + to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); + } + } + } + } + + // wait for all tasks to finish + for (auto &task: ctx->tasks) { + task.get(); + } + ctx->tasks.clear(); + } + return wdata; +} + +static void ggmlhexagon_set_runtime_path(size_t device, const std::string & path) { + if ((HEXAGON_BACKEND_QNNNPU == device) || (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach)) { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLHEXAGON_LOG_DEBUG("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv successfully"); + } else { + GGMLHEXAGON_LOG_ERROR("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv failure"); + } + if (0 == setenv("ADSP_LIBRARY_PATH", + (path + + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), + 1)) { + GGMLHEXAGON_LOG_DEBUG("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv successfully"); + } else { + GGMLHEXAGON_LOG_ERROR("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv failure"); + } + } else { + if (0 == setenv("LD_LIBRARY_PATH", + (path + + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), + 1)) { + GGMLHEXAGON_LOG_DEBUG("%s backend setenv successfully\n", + ggml_backend_hexagon_get_devname(device)); + } else { + GGMLHEXAGON_LOG_ERROR("%s backend setenv failure\n", + ggml_backend_hexagon_get_devname(device)); + } + } +} + +static void ggmlhexagon_load_cfg() { + //this function can be called in various scenarios + static bool initialized = false; + if (initialized) { + GGMLHEXAGON_LOG_DEBUG("hexagon appcfg file already loaded\n"); + return; + } + char time_string[GGMLHEXAGON_TMPBUF_LEN]; + memset(time_string, 0, GGMLHEXAGON_TMPBUF_LEN); + ggmlhexagon_get_timestring(time_string); + 
GGMLHEXAGON_LOG_DEBUG("program running start time:%s", time_string); + std::string cfg_filename = std::string(g_hexagon_appcfg.runtimelib_path) + std::string(g_hexagon_appcfg.cfgfilename); + GGMLHEXAGON_LOG_INFO("load hexagon appcfg from %s", cfg_filename.c_str()); + hexagon_appcfg qnncfg_instance; + qnncfg_instance.load(cfg_filename); + qnncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + std::ostringstream tmposs; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]" << std::endl; + GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str()); + }); + std::string precision_mode; + qnncfg_instance.get_intvalue("general", "print_qnn_internal_log", g_hexagon_appcfg.print_qnn_internal_log, 0); + qnncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 0); + qnncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0); + qnncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0); + qnncfg_instance.get_intvalue("general", "hwaccel_approach", g_hexagon_appcfg.hwaccel_approach, 0); + qnncfg_instance.get_intvalue("general", "hexagon_backend", g_hexagon_appcfg.hexagon_backend, 2); + qnncfg_instance.get_intvalue("qnn", "hvx_threads", g_hexagon_appcfg.hvx_threads, 4); + qnncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_hexagon_appcfg.vtcm_size_in_mb, 8); + qnncfg_instance.get_intvalue("qnn", "enable_dlbc", g_hexagon_appcfg.enable_dlbc, 0); + qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); + qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_hexagon_appcfg.enable_mulmat_cdsp, 0); + qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0); + GGMLHEXAGON_LOG_INFO("print_qnn_internal_log=%d", g_hexagon_appcfg.print_qnn_internal_log); + GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_INFO("hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); + GGMLHEXAGON_LOG_INFO("npu inference precision mode=%s", precision_mode.c_str()); + GGMLHEXAGON_LOG_INFO("qnn runtime lib path=%s", g_hexagon_appcfg.runtimelib_path); + if (precision_mode.find("fp16") != std::string::npos) { + g_hexagon_appcfg.precision_mode = 1; + } else { + g_hexagon_appcfg.precision_mode = 0; + } + initialized = true; +} + +// ================================================================================================= +// section-5: QNN helper function +// ================================================================================================= +//ensure every QNN tensor/opcfg name is unique, threadsafe is not required at the moment +static void ggmlqnn_reset_idx() { + g_qnntensor_idx = 0; + g_qnnopcfg_idx = 0; +} + +static void ggmlqnn_inc_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + g_qnntensor_idx++; + break; + case QNN_OPCFG_INDEX: + g_qnnopcfg_idx++; + break; + default: + break; + } +} + +static int32_t ggmlqnn_get_idx(int idx_type) { + switch (idx_type) { + case QNN_TENSOR_INDEX: + return g_qnntensor_idx; + case QNN_OPCFG_INDEX: + return g_qnnopcfg_idx; + default: + break; + } +} + +static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { + return offset % alignment == 0 ? 
offset + : offset + + (static_cast(alignment) - + offset % static_cast(alignment)); +} + static size_t ggmlqnn_memscpy(void * dst, size_t dst_size, const void * src, size_t copy_size) { if (!dst || !src || !dst_size || !copy_size) return 0; @@ -1093,162 +1528,159 @@ static char * ggmlqnn_strndup(const char * source, size_t maxlen) { #endif } -// ================================================================================================= -// section-5: QNN helper function -// ================================================================================================= -static inline uint32_t get_qnn_tensorid(const Qnn_Tensor_t & tensor) { +static inline uint32_t ggmlqnn_get_tensorid(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.id; } return 0u; } -static inline const char * get_qnn_tensorname(const Qnn_Tensor_t & tensor) { +static inline const char * ggmlqnn_get_tensorname(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.name; } return nullptr; } -static inline Qnn_TensorType_t get_qnn_tensortype(const Qnn_Tensor_t & tensor) { +static inline Qnn_TensorType_t ggmlqnn_get_tensortype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.type; } return QNN_TENSOR_TYPE_UNDEFINED; } -static inline Qnn_TensorDataFormat_t get_qnn_tensor_dataformat(const Qnn_Tensor_t & tensor) { +static inline Qnn_TensorDataFormat_t ggmlqnn_get_tensor_dataformat(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataFormat; } return QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER; } -static inline Qnn_DataType_t get_qnn_tensor_datatype(const Qnn_Tensor_t & tensor) { +static inline Qnn_DataType_t ggmlqnn_get_tensor_datatype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dataType; } return QNN_DATATYPE_UNDEFINED; } -static inline Qnn_QuantizeParams_t get_qnn_tensor_quantparams(const Qnn_Tensor_t & tensor) { +static inline Qnn_QuantizeParams_t ggmlqnn_get_tensor_quantparams(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.quantizeParams; } return QNN_QUANTIZE_PARAMS_INIT; } -static inline uint32_t get_qnn_tensor_rank(const Qnn_Tensor_t & tensor) { +static inline uint32_t ggmlqnn_get_tensor_rank(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.rank; } return 0u; } -static inline uint32_t * get_qnn_tensor_dimensions(const Qnn_Tensor_t & tensor) { +static inline uint32_t * ggmlqnn_get_tensor_dimensions(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.dimensions; } return nullptr; } -static inline Qnn_TensorMemType_t get_qnn_tensor_memtype(const Qnn_Tensor_t & tensor) { +static inline Qnn_TensorMemType_t ggmlqnn_get_tensor_memtype(const Qnn_Tensor_t & tensor) { if (tensor.version == QNN_TENSOR_VERSION_1) { return tensor.v1.memType; } return QNN_TENSORMEMTYPE_UNDEFINED; } -static inline void set_qnn_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { +static inline void ggmlqnn_set_tensor_id(Qnn_Tensor_t & tensor, uint32_t id) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.id = id; } } -static inline void set_qnn_tensor_name(Qnn_Tensor_t & tensor, const char * name) { +static inline void ggmlqnn_set_tensor_name(Qnn_Tensor_t & tensor, const char * name) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.name = name; } } -static inline void 
set_qnn_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { +static inline void ggmlqnn_set_tensor_type(Qnn_Tensor_t & tensor, Qnn_TensorType_t type) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.type = type; } } -static inline void set_qnn_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { +static inline void ggmlqnn_set_tensor_dataformat(Qnn_Tensor_t & tensor, Qnn_TensorDataFormat_t format) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataFormat = format; } } -static inline void set_qnn_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { +static inline void ggmlqnn_set_tensor_datatype(Qnn_Tensor_t & tensor, Qnn_DataType_t dataType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dataType = dataType; } } -static inline void set_qnn_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { +static inline void ggmlqnn_set_tensor_quantparams(Qnn_Tensor_t & tensor, Qnn_QuantizeParams_t params) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.quantizeParams = params; } } -static inline void set_qnn_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { +static inline void ggmlqnn_set_tensor_rank(Qnn_Tensor_t & tensor, uint32_t rank) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.rank = rank; } } -static inline void set_qnn_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { +static inline void ggmlqnn_set_tensor_dimensions(Qnn_Tensor_t & tensor, uint32_t * dims) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.dimensions = dims; } } -static inline void set_qnn_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { +static inline void ggmlqnn_set_tensor_memtype(Qnn_Tensor_t & tensor, Qnn_TensorMemType_t memType) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memType = memType; } } -static inline void set_qnn_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { +static inline void ggmlqnn_set_tensor_clientbuf(Qnn_Tensor_t & tensor, Qnn_ClientBuffer_t clientBuf) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.clientBuf = clientBuf; } } -static inline void set_qnn_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { +static inline void ggmlqnn_set_tensor_memhandle(Qnn_Tensor_t & tensor, Qnn_MemHandle_t handle) { if (tensor.version == QNN_TENSOR_VERSION_1) { tensor.v1.memHandle = handle; } } -static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { +static int ggmlqnn_deep_copy_qnntensor(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { int err = 0; dst.version = src.version; - QNN_TENSOR_SET_NAME(dst, ggmlqnn_strndup(QNN_TENSOR_GET_NAME(src), std::string(QNN_TENSOR_GET_NAME(src)).size())); - if (nullptr == QNN_TENSOR_GET_NAME(dst)) { + ggmlqnn_set_tensor_name(dst, ggmlqnn_strndup(ggmlqnn_get_tensorname(src), std::string(ggmlqnn_get_tensorname(src)).size())); + if (nullptr == ggmlqnn_get_tensorname(dst)) { return 1; } - QNN_TENSOR_SET_ID(dst, QNN_TENSOR_GET_ID(src)); - QNN_TENSOR_SET_TYPE(dst, QNN_TENSOR_GET_TYPE(src)); - QNN_TENSOR_SET_DATA_FORMAT(dst, QNN_TENSOR_GET_DATA_FORMAT(src)); - QNN_TENSOR_SET_DATA_TYPE(dst, QNN_TENSOR_GET_DATA_TYPE(src)); - QNN_TENSOR_SET_MEM_TYPE(dst, QNN_TENSOR_GET_MEM_TYPE(src)); + ggmlqnn_set_tensor_id(dst, ggmlqnn_get_tensorid(src)); + ggmlqnn_set_tensor_type(dst, ggmlqnn_get_tensortype(src)); + ggmlqnn_set_tensor_dataformat(dst, ggmlqnn_get_tensor_dataformat(src)); + ggmlqnn_set_tensor_datatype(dst, ggmlqnn_get_tensor_datatype(src)); + 
ggmlqnn_set_tensor_memtype(dst, ggmlqnn_get_tensor_memtype(src)); - if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_RAW) { + if (ggmlqnn_get_tensor_memtype(src) == QNN_TENSORMEMTYPE_RAW) { Qnn_ClientBuffer_t client_buf = {nullptr, 0}; - QNN_TENSOR_SET_CLIENT_BUF(dst, client_buf); - } else if (QNN_TENSOR_GET_MEM_TYPE(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { - QNN_TENSOR_SET_MEM_HANDLE(dst, nullptr); + ggmlqnn_set_tensor_clientbuf(dst, client_buf); + } else if (ggmlqnn_get_tensor_memtype(src) == QNN_TENSORMEMTYPE_MEMHANDLE) { + ggmlqnn_set_tensor_memhandle(dst, nullptr); } else { return 1; } - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(src); + Qnn_QuantizeParams_t src_qparam = ggmlqnn_get_tensor_quantparams(src); Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; @@ -1260,7 +1692,7 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { scale_offset_size, src_qparam.axisScaleOffsetEncoding.scaleOffset, scale_offset_size); - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + ggmlqnn_set_tensor_quantparams(dst, src_qparam_cpy); } else if (encoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET) { Qnn_QuantizeParams_t src_qparam_cpy = src_qparam; Qnn_BwAxisScaleOffset_t & bwaxis_scale_offset = src_qparam_cpy.bwAxisScaleOffsetEncoding; @@ -1275,29 +1707,29 @@ static int deep_copy_qnn_tensors(Qnn_Tensor_t & src, Qnn_Tensor_t & dst) { *offsets = (int32_t *)malloc(offset_size); ggmlqnn_memscpy(*offsets, offset_size, src_qparam.bwAxisScaleOffsetEncoding.offsets, offset_size); } - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam_cpy); + ggmlqnn_set_tensor_quantparams(dst, src_qparam_cpy); } else { - QNN_TENSOR_SET_QUANT_PARAMS(dst, src_qparam); + ggmlqnn_set_tensor_quantparams(dst, src_qparam); } - uint32_t rank = QNN_TENSOR_GET_RANK(src); - QNN_TENSOR_SET_RANK(dst, rank); + uint32_t rank = ggmlqnn_get_tensor_rank(src); + ggmlqnn_set_tensor_rank(dst, rank); size_t dim_size = GGML_MAX_DIMS * sizeof(uint32_t); uint32_t * dimensions = (uint32_t *)malloc(dim_size); if (nullptr == dimensions) { - GGMLQNN_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", QNN_TENSOR_GET_NAME(src)); + GGMLHEXAGON_LOG_WARN("deep_copy_qnn_tensors() allocation error while copying tensor %s\n", ggmlqnn_get_tensorname(src)); return 1; } - ggmlqnn_memscpy(dimensions, dim_size, QNN_TENSOR_GET_DIMENSIONS(src), dim_size); - QNN_TENSOR_SET_DIMENSIONS(dst, dimensions); + ggmlqnn_memscpy(dimensions, dim_size, ggmlqnn_get_tensor_dimensions(src), dim_size); + ggmlqnn_set_tensor_dimensions(dst, dimensions); return err; } -static int free_qnn_tensor(Qnn_Tensor_t * tensor) { +static int ggmlqnn_free_qnntensor(Qnn_Tensor_t * tensor) { int err = 0; - free((void *) QNN_TENSOR_GET_NAME(*tensor)); - Qnn_QuantizeParams_t src_qparam = QNN_TENSOR_GET_QUANT_PARAMS(*tensor); + free((void *) ggmlqnn_get_tensorname(*tensor)); + Qnn_QuantizeParams_t src_qparam = ggmlqnn_get_tensor_quantparams(*tensor); Qnn_QuantizationEncoding_t encoding = src_qparam.quantizationEncoding; if (encoding == QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET) { free(src_qparam.axisScaleOffsetEncoding.scaleOffset); @@ -1307,7 +1739,7 @@ static int free_qnn_tensor(Qnn_Tensor_t * tensor) { free(src_qparam.bwAxisScaleOffsetEncoding.offsets); } } - free(QNN_TENSOR_GET_DIMENSIONS(*tensor)); + free(ggmlqnn_get_tensor_dimensions(*tensor)); free(tensor); return err; @@ -1416,4542 
+1848,4037 @@ static const char * ggmlqnn_get_qnnerror_string(Qnn_ErrorHandle_t qnn_error_code } } -// ================================================================================================= -// section-6: Hexagon DSP helper function -// ================================================================================================= -static const char * ggmlhexagon_get_dsp_name(int domain_id) { - switch (domain_id) { - case HEXAGON_ADSP: - return "Hexagon-aDSP"; - case HEXAGON_MDSP: - return "Hexagon-mDSP"; - case HEXAGON_SDSP: - return "Hexagon-sDSP"; - case HEXAGON_CDSP: - return "Hexagon-cDSP"; - case HEXAGON_CDSP1: - return "Hexagon-cDSP1"; +// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 +static Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { + switch (ggmltype) { + case GGML_TYPE_F16: + return QNN_DATATYPE_FLOAT_16; + case GGML_TYPE_F32: + return QNN_DATATYPE_FLOAT_32; + case GGML_TYPE_I8: + return QNN_DATATYPE_INT_8; + case GGML_TYPE_Q8_0: + return QNN_DATATYPE_SFIXED_POINT_8; + case GGML_TYPE_Q4_0: + return QNN_DATATYPE_SFIXED_POINT_4; default: - return "Hexagon-unknown"; - } -} - -static int ggmlhexagon_pd_status_notifier_callback(void * context, int domain, int session, remote_rpc_status_flags_t status){ - int error = AEE_SUCCESS; - switch (status){ - case FASTRPC_USER_PD_UP: - GGMLQNN_LOG_DEBUG("PD is up\n"); - break; - case FASTRPC_USER_PD_EXIT: - GGMLQNN_LOG_DEBUG("PD closed\n"); - break; - case FASTRPC_USER_PD_FORCE_KILL: - GGMLQNN_LOG_DEBUG("PD force kill\n"); - break; - case FASTRPC_USER_PD_EXCEPTION: - GGMLQNN_LOG_DEBUG("PD exception\n"); - break; - case FASTRPC_DSP_SSR: - GGMLQNN_LOG_DEBUG("DSP SSR\n"); - break; - default : - error = AEE_EBADITEM; break; } - return error; + return QNN_DATATYPE_UNDEFINED; } -static domain * ggmlhexagon_get_domain(int domain_id) { - int size = sizeof(hexagon_supported_domains) / sizeof(domain); - - for (size_t i = 0; i < size; i++) { - if (hexagon_supported_domains[i].id == domain_id) - return &hexagon_supported_domains[i]; +static void ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) { + if (rank > GGML_MAX_DIMS) { + GGMLHEXAGON_LOG_WARN("invalid params"); + return; + } + if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { + GGMLHEXAGON_LOG_WARN("invalid params"); + return; } + for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) + qnn_dimensions[idx] = ggml_dimensions[idx]; - return nullptr; + if (rank >= 2) { + qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; + qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; + } } -static bool ggmlhexagon_is_cdsp(int domain_id) { - return (domain_id == HEXAGON_CDSP) || (domain_id == HEXAGON_CDSP1); +template +Fn ggmlqnn_load_qnn_functionpointers(void * handle, const char * function_name) { + return reinterpret_cast(dlsym(handle, function_name)); } -static bool ggmlhexagon_is_valid_domain_id(int domain_id, int compute_only) { - int size = sizeof(hexagon_supported_domains) / sizeof(domain); +class qnn_interface { +#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... 
args) const { \ + return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } - if (compute_only) { - return ggmlhexagon_is_cdsp(domain_id); - } - for (size_t i = 0; i < size; i++) { - if (hexagon_supported_domains[i].id == domain_id) - return true; - } +#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ + template \ + inline auto qnn_##F(Args... args) const { \ + return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ + std::forward(args)...); \ + } - return false; -} + friend class qnn_instance; -static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_domains, fastrpc_domain ** domains_info) { - int hexagon_err = AEE_SUCCESS; - int ss_info = 0; - ss_info = strcmp(domain_type, "NSP")? HPASS: NSP; - system_req_payload req; - memset(&req, 0, sizeof(system_req_payload)); - req.id = FASTRPC_GET_DOMAINS; - req.sys.domains = nullptr; - fastrpc_domain * domain = nullptr; +public: + qnn_interface() = default; - if (ss_info != 0) { - req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info); - } else { - req.sys.flags =0; - } + // QnnBackend + DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) -#ifdef _WIN32 - hexagon_err = AEE_EUNSUPPORTED; - goto bail; -#endif + DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree) - if (remote_system_request) { - hexagon_err = remote_system_request(&req); - if (hexagon_err != AEE_SUCCESS) { - GGMLQNN_LOG_DEBUG("failure in remote_system_request call: %d", hexagon_err); - goto bail; - } - //allocate memory for domain-info array - req.sys.max_domains = req.sys.num_domains; - void * buffer = calloc(req.sys.num_domains, sizeof(fastrpc_domain)); - if (nullptr == buffer) { - hexagon_err = AEE_ENOMEMORY; - GGMLQNN_LOG_DEBUG("unable to allocate memory for req.sys.domains"); - goto bail; - } - req.sys.domains = static_cast(buffer); - hexagon_err = remote_system_request(&req); - if (hexagon_err != AEE_SUCCESS) { - GGMLQNN_LOG_DEBUG("failure in remote_system_request call: %d.\n", hexagon_err); - goto bail; - } + DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage) - for (int i = 0; i < req.sys.num_domains; i++) { - //verify that only requested type domains were returned - domain = &req.sys.domains[i]; - if (domain->type != ss_info) { - hexagon_err = -1; - GGMLQNN_LOG_DEBUG("incorrect data received from remote_system_request.\n"); - goto bail; - } - } - *domains_info = req.sys.domains; - *num_domains = req.sys.num_domains; - } else { - hexagon_err = AEE_EUNSUPPORTED; - goto bail; - } + DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig) -bail: - if (hexagon_err && !req.sys.domains) { - free(req.sys.domains); - } - return hexagon_err; -} + DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion) -static int ggmlhexagon_get_dsp_support(int * domain) { - int hexagon_error = AEE_SUCCESS; - *domain = HEXAGON_CDSP; + // QnnDevice + DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate) - if (remote_handle_control) { - struct remote_dsp_capability dsp_capability_domain = {HEXAGON_CDSP, DOMAIN_SUPPORT, 0}; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); - if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); - goto bail; - } + DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree) - if (0 == 
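// The ggmlqnn_get_qnn_dimensions_from_ggml_dimensions() helper introduced
// earlier in this hunk copies all GGML dims and then swaps the two outermost
// axes when rank >= 2. A self-contained sketch of that behaviour, with
// MAX_DIMS standing in for GGML_MAX_DIMS:
#include <cstdint>
#include <cstdio>

constexpr uint32_t MAX_DIMS = 4;

static void to_qnn_dims(uint32_t * qnn, const uint32_t * ggml, uint32_t rank) {
    for (uint32_t i = 0; i < MAX_DIMS; i++) {
        qnn[i] = ggml[i];
    }
    if (rank >= 2) { // swap the last two used axes
        qnn[rank - 1] = ggml[rank - 2];
        qnn[rank - 2] = ggml[rank - 1];
    }
}

int main() {
    uint32_t ggml_dims[MAX_DIMS] = { 64, 8, 1, 1 }; // ne[0] = 64, ne[1] = 8
    uint32_t qnn_dims[MAX_DIMS]  = { 0, 0, 0, 0 };
    to_qnn_dims(qnn_dims, ggml_dims, 2);
    printf("%u x %u\n", qnn_dims[0], qnn_dims[1]); // prints 8 x 64
    return 0;
}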
dsp_capability_domain.capability) { - dsp_capability_domain.domain = HEXAGON_ADSP; - dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT; - dsp_capability_domain.capability = 0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); - if(dsp_capability_domain.capability) { - *domain = HEXAGON_ADSP; - } - } + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure) - if (hexagon_error != AEE_SUCCESS) { - GGMLQNN_LOG_DEBUG("get_dsp_support failed with error 0x%x", hexagon_error); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; - GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); - } + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo) -bail: - return hexagon_error; -} + DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo) -static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capability) { - int hexagon_error = AEE_SUCCESS; - *capability = 0; + // QnnContext + DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate) - if (attr == VTCM_PAGE || attr == VTCM_COUNT) { - } else { - hexagon_error = AEE_EBADPARM; - GGMLQNN_LOG_DEBUG("unsupported attr, only VTCM_PAGE and VTCM_COUNT supported"); - goto bail; - } + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize) - if (remote_handle_control) { - if (domain == HEXAGON_ADSP || domain == HEXAGON_CDSP) { - /* - * query the DSP for VTCM information - * since the ADSP does not have a dedicated VTCM, we expect the output to be 0 - */ - struct remote_dsp_capability dsp_capability_vtcm_dsp; - dsp_capability_vtcm_dsp.domain = (uint32_t)domain; - dsp_capability_vtcm_dsp.attribute_ID = attr; - dsp_capability_vtcm_dsp.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability)); - if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); - GGMLQNN_LOG_DEBUG("running the use case without checking the capability"); - hexagon_error = AEE_SUCCESS; - goto bail; - } else if (hexagon_error == AEE_SUCCESS) { - *capability = dsp_capability_vtcm_dsp.capability; - } else { - GGMLQNN_LOG_DEBUG("get_vtcm_info failed with error 0x%x", hexagon_error); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTED; - GGMLQNN_LOG_DEBUG("unsupported domain %d", domain); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; - GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); - } + DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary) -bail: - return hexagon_error; -} + DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary) -static bool ggmlhexagon_is_unsignedpd_supported(int domain_id) { - int hexagon_error = AEE_SUCCESS; - if (remote_handle_control) { - struct remote_dsp_capability dsp_capability_domain = {static_cast(domain_id), UNSIGNED_PD_SUPPORT, 0}; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); - if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGMLQNN_LOG_WARN("FastRPC Capability API is not supported on this device. 
Falling back to signed pd"); - return false; - } + DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree) - if (hexagon_error) { - GGMLQNN_LOG_WARN("error 0x%x: FastRPC Capability API failed. falling back to signed pd", hexagon_error); - return false; - } + // QnnGraph + DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate) - if (dsp_capability_domain.capability == 1) { - return true; - } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; - GGMLQNN_LOG_WARN("remote_dsp_capability interface is not supported on this device.falling back to signed pd"); - return false; - } + DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode) - return false; -} + DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize) -static bool ggmlhexagon_get_unsignedpd_support(void) { - return ggmlhexagon_is_unsignedpd_supported(HEXAGON_CDSP); -} + DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) -static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { - int hexagon_error = AEE_SUCCESS; - if (remote_handle_control) { - if (domain == HEXAGON_CDSP) { - /* - * Query the DSP for ASYNC_FASTRPC_SUPPORT information - * Async fastrpc is supported only on CDSP - */ - struct remote_dsp_capability dsp_capability_async_support; - dsp_capability_async_support.domain = (uint32_t)domain; - dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT; - dsp_capability_async_support.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability)); - if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGMLQNN_LOG_WARN("FastRPC Capability API is not supported on this device"); - hexagon_error = AEE_SUCCESS; - goto bail; - } else if (dsp_capability_async_support.capability == 1) { - return true; - } + DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve) - if (hexagon_error != AEE_SUCCESS){ - GGMLQNN_LOG_WARN("failed with error 0x%x", hexagon_error); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTED; - GGMLQNN_LOG_WARN("async FastRPC is not supported on domain %d", domain); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; - GGMLQNN_LOG_WARN("remote_dsp_capability interface is not supported on this device"); - } + // QnnLog + DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate) -bail: - return false; -} + DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree) -static void ggmlhexagon_set_rpc_latency(int domain, int qos, int latency) { - int hexagon_error = AEE_SUCCESS; + DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel) - if (remote_handle_control) { - struct remote_rpc_control_latency data; -/* - qos | latency - ----------------------- - RPC_PM_QOS | 300 - RPC_POLL_QOS | 1000 -*/ - data.enable = qos; - data.latency = latency; - hexagon_error = remote_handle64_control(DSPRPC_GET_DSP_INFO, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); - if (hexagon_error != AEE_SUCCESS) { - //FIXME: why set rpc latency failure - GGMLQNN_LOG_WARN("failed with error 0x%x", hexagon_error); - goto bail; - } else { - GGMLQNN_LOG_INFO("set rpc qos %d, latency %d\n", qos, latency); - } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; - GGMLQNN_LOG_WARN("remote_dsp_capability interface is not supported on this device"); - } + // QnnProfile + DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate) -bail: - return; -} + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents) -static bool ggmlhexagon_is_status_notification_supported(int domain) { - 
int hexagon_error = AEE_SUCCESS; + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents) - if (remote_handle_control) { - /* - * Query the DSP for STATUS_NOTIFICATION_SUPPORT information - * DSP User PD status notification Support - */ - struct remote_dsp_capability dsp_capability_status_notification_support; - dsp_capability_status_notification_support.domain = (uint32_t)domain; - dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; - dsp_capability_status_notification_support.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); - if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGMLQNN_LOG_WARN("FastRPC Capability API is not supported on this device"); - hexagon_error = AEE_SUCCESS; - goto bail; - } else if (1 == dsp_capability_status_notification_support.capability) { - return true; - } + DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData) - if (hexagon_error != AEE_SUCCESS){ - GGMLQNN_LOG_WARN("failed with error 0x%x", hexagon_error); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; - GGMLQNN_LOG_WARN("remote_dsp_capability interface is not supported on this device"); - } + DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree) -bail: - return false; -} + // QnnMem + DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister) -static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t * capability) { - int hexagon_error = AEE_SUCCESS; - *capability = 0; + DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister) - if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { - hexagon_error = AEE_EBADPARM; - GGMLQNN_LOG_WARN("unsupported attr, only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported"); - goto bail; - } + // QnnProperty + DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability) - if (remote_handle_control) { - if (domain == HEXAGON_CDSP) { - /* - * Query the DSP for HMX SUPPORT information - * HMX is supported on CDSP only - */ - struct remote_dsp_capability dsp_capability_hmx_dsp; - dsp_capability_hmx_dsp.domain = (uint32_t)domain; - dsp_capability_hmx_dsp.attribute_ID = attr; - dsp_capability_hmx_dsp.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); - if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); - hexagon_error = AEE_SUCCESS; - goto bail; - } - else if (hexagon_error == AEE_SUCCESS) { - *capability = dsp_capability_hmx_dsp.capability; - } else { - GGMLQNN_LOG_DEBUG("get_hmx_support_info failed with Error 0x%x", hexagon_error); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTED; - GGMLQNN_LOG_DEBUG("HMX support is not there for domain %d", domain); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; - GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); - } + // QnnTensor + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor) -bail: - return hexagon_error; -} + DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor) -static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) { - int hexagon_error = AEE_SUCCESS; - *capability = 0; - if(remote_handle_control) { - 
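// DEFINE_SHIM_FUNCTION_INTERFACE / DEFINE_SHIM_FUNCTION_SYS_INTERFACE expand
// into thin variadic wrappers that forward their arguments to a function
// pointer held in the loaded provider table. A self-contained sketch of that
// forwarding pattern; mock_api_table and mock_backend_create are illustrative
// stand-ins, not part of the QNN SDK.
#include <cstdio>
#include <utility>

struct mock_api_table {
    int (*backendCreate)(int cfg);
};

static int mock_backend_create(int cfg) { return cfg + 1; }

class mock_interface {
public:
    explicit mock_interface(const mock_api_table * table) : _table(table) {}

    // roughly what one macro expansion looks like after preprocessing
    template <typename... Args>
    auto qnn_backend_create(Args... args) const {
        return (_table->backendCreate)(std::forward<Args>(args)...);
    }

private:
    const mock_api_table * _table;
};

int main() {
    mock_api_table table = { mock_backend_create };
    mock_interface iface(&table);
    printf("%d\n", iface.qnn_backend_create(41)); // prints 42
    return 0;
}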
/* - * Query the Hexagon processor architecture version information - */ - struct remote_dsp_capability dsp_capability_arch_ver; - dsp_capability_arch_ver.domain = (uint32_t)domain; - dsp_capability_arch_ver.attribute_ID = ARCH_VER; - dsp_capability_arch_ver.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_arch_ver, sizeof(struct remote_dsp_capability)); - if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); - hexagon_error = AEE_SUCCESS; - goto bail; - } else if (hexagon_error == AEE_SUCCESS) { - *capability = dsp_capability_arch_ver.capability & 0xFF; - } else { - GGMLQNN_LOG_DEBUG("get_hex_arch_ver failed with error 0x%x", hexagon_error); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; - GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + // QnnSystem + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate) + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo) + + DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree) + + void set_qnn_interface(const QnnInterface_t * qnn_interface) { + _qnn_interface = qnn_interface; } -bail: - return hexagon_error; -} + void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { + _qnn_sys_interface = qnn_sys_interface; + } -static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t * capability) -{ - int hexagon_error = AEE_SUCCESS; - *capability = 0; - if (attr == HVX_SUPPORT_64B) { - hexagon_error = AEE_EBADPARM; - GGMLQNN_LOG_DEBUG("latest targets have 128 byte HVX register, use HVX_SUPPORT_128B instead of HVX_SUPPORT_64B"); - goto bail; + uint32_t get_backend_id() const { + return _qnn_interface->backendId; } - if (attr != HVX_SUPPORT_128B) { - hexagon_error = AEE_EBADPARM; - GGMLQNN_LOG_DEBUG("unsupported attr. 
only HVX_SUPPORT_128B supported"); - goto bail; + bool is_loaded() const { + return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); } - if (remote_handle_control) { - if (domain == HEXAGON_CDSP) { - /* - * Query the DSP for HVX SUPPORT information - * HVX is supported on CDSP only - */ - struct remote_dsp_capability dsp_capability_hvx_dsp; - dsp_capability_hvx_dsp.domain = (uint32_t)domain; - dsp_capability_hvx_dsp.attribute_ID = attr; - dsp_capability_hvx_dsp.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); - if ((hexagon_error & 0xFF)==(AEE_EUNSUPPORTEDAPI & 0xFF)) { - GGMLQNN_LOG_DEBUG("FastRPC Capability API is not supported on this device"); - hexagon_error = AEE_SUCCESS; - goto bail; - } else if (hexagon_error == AEE_SUCCESS) { - *capability = dsp_capability_hvx_dsp.capability; - } else { - GGMLQNN_LOG_DEBUG("failed with error 0x%x", hexagon_error); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTED; - GGMLQNN_LOG_DEBUG("HVX support is not available on domain %d", domain); - goto bail; - } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; - GGMLQNN_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); +private: + const QnnInterface_t * _qnn_interface = nullptr; + + const QnnSystemInterface_t * _qnn_sys_interface = nullptr; +}; + +class qnn_instance { +public: + using BackendIdType = decltype(QnnInterface_t{}.backendId); + + explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, + const std::string & model_name) : + _lib_path(std::move(lib_path)), + _backend_name(std::move(backend_name)), + _model_name(std::move(model_name)) {} + + ~qnn_instance() { } -bail: - return hexagon_error; -} + int qnn_init(const QnnSaver_Config_t ** saver_config); -static int ggmlhexagon_request_status_notifications(int domain_id, void * context, notify_callback_fn call_back_fn) { - int hexagon_error = AEE_SUCCESS; - struct remote_rpc_notif_register notif; - bool status_notification_support; + int qnn_finalize(); - notif.context = context; - notif.domain = domain_id; - notif.notifier_fn = call_back_fn; + const qnn_interface & get_qnn_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + } + return _qnn_interface; + } - status_notification_support = ggmlhexagon_is_status_notification_supported(domain_id); - if (status_notification_support) { - hexagon_error = remote_session_control(FASTRPC_REGISTER_STATUS_NOTIFICATIONS, (void*)¬if, sizeof(notif)); - if (hexagon_error != AEE_SUCCESS) { - GGMLQNN_LOG_DEBUG("error 0x%x: remote_session_control failed to enable status notifications", hexagon_error); + const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } - } else { - hexagon_error = AEE_EUNSUPPORTEDAPI; + return _qnn_raw_interface; } - return hexagon_error; -} - -static void ggmlhexagon_probe_dspinfo(ggml_backend_qnn_context * ctx, size_t * rpcmem_capacity) { - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); - if 
(nullptr == rpc_buffer) { - GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - rpcmem_free(rpc_buffer); - rpc_buffer = nullptr; + const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { + if (!_qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("pls check why _qnn_interface is not loaded\n"); } + return _qnn_raw_system_interface; } - *rpcmem_capacity = candidate_size; - GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", *rpcmem_capacity); + Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } - uint32_t dsp_version = 0; - ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version); + Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } - if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) { - GGMLQNN_LOG_DEBUG("dsp arch version 0x%x", dsp_version); - } else { - GGMLQNN_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version); - } + Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } - uint32_t vtcm_count = 0; - uint32_t vtcm_page = 0; - ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count); - ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE, &vtcm_page); - GGMLQNN_LOG_DEBUG("vtcm_count %d", vtcm_count); - GGMLQNN_LOG_DEBUG("vtcm_page %d", vtcm_page); + Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } - uint32_t hmx_depth = 0; - uint32_t hmx_spatial = 0; - ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH, &hmx_depth); - ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial); - GGMLQNN_LOG_DEBUG("hmx_depth %d", hmx_depth); - GGMLQNN_LOG_DEBUG("hmx_spatial %d", hmx_spatial); + Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } - uint32_t hvx_support_128b = 0; - ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b); - GGMLQNN_LOG_DEBUG("hvx_support_128b %d", hvx_support_128b); + QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } - GGMLQNN_LOG_DEBUG("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); - GGMLQNN_LOG_DEBUG("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); -} + Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } -static int ggmlhexagon_init_dsp(ggml_backend_qnn_context * ctx) { - int hexagon_error = AEE_SUCCESS; + int init_qnn_graph(const char * graph_name, + bool debug, + uint8_t do_node_validation = 1, + const QnnGraph_Config_t ** graph_configs = nullptr + ); + int init_qnn_graph(const std::string & graph_name, HEXAGONBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); - int domain_id = HEXAGON_CDSP; - const char * domain_type = "NSP"; + int finalize_qnn_graph(); - int unsignedpd_flag = 1; - bool is_unsignedpd_enabled = false; - int use_logical_id = 0; - int core_id = -1; - fastrpc_domain * domains_info = NULL; - fastrpc_domain * domain_info = NULL; - int num_domains = -1; + bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } - domain * my_domain = NULL; - char * uri = NULL; + int htp_init_perfinfra(); - char * ggmlop_domain_uri = NULL; - int ggmlop_domain_uri_len = 0; + int htp_set_rpc_polling(); - if (nullptr == ctx) - return 1; - GGMLQNN_LOG_INFO("init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_qnn_get_devname(ctx->device)); - 
//TODO: reasonable rpc memory pool size and use it practically - ctx->ggmlop_handle = -1; - ctx->rpc_mempool_len = (1 << 20) * 512; - ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, ctx->rpc_mempool_len); - if (nullptr == ctx->rpc_mempool) { - hexagon_error = AEE_ENORPCMEMORY; - printf("rpc memory alloc failed", hexagon_error); - ctx->rpc_mempool_len = 0; - return 2; + int htp_set_high_performance_mode(); + + std::string & get_qnn_graph_name() { return _graph_name; } + + bool is_rpcmem_initialized() { + return _rpcmem_initialized; } - if (-1 == domain_id) { - if (nullptr != domain_type) { - if ((strcmp(domain_type, "NSP") != 0 && strcmp(domain_type, "HPASS") != 0)) { - GGMLQNN_LOG_WARN("invalid domain_type %s. possible values are NSP or HPASS", domain_type); - goto bail; - } else { - hexagon_error = ggmlhexagon_get_domains_info(domain_type, &num_domains, &domains_info); - if (hexagon_error == AEE_EUNSUPPORTED) { - GGMLQNN_LOG_DEBUG("API is not supported on this target so cannot get domains info from the device. falling back to legacy approach of using default domain id"); - hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); - if (hexagon_error != AEE_SUCCESS) { - GGMLQNN_LOG_DEBUG("error: 0x%x, defaulting to CDSP domain", hexagon_error); - } - } else if (hexagon_error != AEE_SUCCESS) { - GGMLQNN_LOG_DEBUG("error in getting domains information"); - goto bail; - } else { - if (core_id != -1) { - if (core_id < 0 || core_id >= num_domains) { - GGMLQNN_LOG_DEBUG("invalid core_id = %d for %s. core_id should be between 0 to %d", core_id, domain_type, num_domains - 1); - hexagon_error = AEE_EBADPARM; - goto bail; - } - } else { - core_id = 0; - } - use_logical_id = 1; - domain_id = domains_info[core_id].id; - } - } - } else { - GGMLQNN_LOG_DEBUG("DSP domain is not provided, retrieving DSP information using Remote APIs"); - hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); - if (hexagon_error != AEE_SUCCESS) { - GGMLQNN_LOG_DEBUG("error: 0x%x, defaulting to CDSP domain", hexagon_error); - } - } + void set_rpcmem_initialized(bool initialized) { + _rpcmem_initialized = initialized; } - if (0 == use_logical_id) { - if (!ggmlhexagon_is_valid_domain_id(domain_id, 0)) { - hexagon_error = AEE_EBADPARM; - GGMLQNN_LOG_DEBUG("error 0x%x: invalid domain %d", hexagon_error, domain_id); - goto bail; - } + size_t get_rpcmem_capacity() { return _rpcmem_capacity; } + size_t get_rpcmem_usage() { return _rpcmem_usage; } - my_domain = ggmlhexagon_get_domain(domain_id); - if (nullptr == my_domain) { - GGMLQNN_LOG_DEBUG("unable to get domain struct %d", domain_id); - goto bail; - } - uri = my_domain->uri; - } - GGMLQNN_LOG_INFO("domain uri=%s\n", uri); + int32_t rpcmem_to_fd(void * buf); - if (1 == unsignedpd_flag) { - is_unsignedpd_enabled = ggmlhexagon_is_unsignedpd_supported(domain_id); - if (!is_unsignedpd_enabled) { - GGMLQNN_LOG_DEBUG("overriding user request for unsigned PD, only signed offload is allowed on domain %d", domain_id); - unsignedpd_flag = 0; - } - } + int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); + Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); - ctx->domain_id = domain_id; - GGMLQNN_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); - GGMLQNN_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); - if (is_unsignedpd_enabled) { - if (remote_session_control) { - struct remote_rpc_control_unsigned_module data; - data.enable = 1; - 
data.domain = domain_id; - hexagon_error = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); - GGMLQNN_LOG_DEBUG("remote_session_control returned %d for configuring unsigned PD success", hexagon_error); - if (AEE_SUCCESS != hexagon_error) { - GGMLQNN_LOG_DEBUG("error 0x%x: remote_session_control failed", hexagon_error); - } - } else { - GGMLQNN_LOG_DEBUG("unsigned PD not supported on this device"); - hexagon_error = AEE_EUNSUPPORTED; - GGMLQNN_LOG_DEBUG("error 0x%x: remote_session_control interface is not supported on this device", hexagon_error); - } - } + void unregister_rpcmem(); + void unregister_rpcmem(Qnn_MemHandle_t mem_handle); - hexagon_error = ggmlhexagon_request_status_notifications(domain_id, (void *)STATUS_CONTEXT, ggmlhexagon_pd_status_notifier_callback); - if (AEE_SUCCESS != hexagon_error) { - if (AEE_EUNSUPPORTEDAPI != hexagon_error) { - GGMLQNN_LOG_WARN("error 0x%x: hexagon_request_status_notifications failed", hexagon_error); - } - GGMLQNN_LOG_WARN("error 0x%x: failed to compute on domain %d", hexagon_error, domain_id); - goto bail; - } + void * alloc_rpcmem(size_t bytes, size_t alignment); + void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); - ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; - ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); - snprintf(ggmlop_domain_uri, ggmlop_domain_uri_len, "%s%s", ggmlop_URI, uri); - GGMLQNN_LOG_INFO("ggmlop domain uri:%s\n", ggmlop_domain_uri); - hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); - if (AEE_SUCCESS == hexagon_error) { - GGMLQNN_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); - GGMLQNN_LOG_INFO("only support GGML_OP_ADD and GGML_OP_MUL_MAT on cDSP currently\n"); - ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1); - size_t rpcmem_size = 0; - ggmlhexagon_probe_dspinfo(ctx, &rpcmem_size); - ggmlhexagon_set_rpc_latency(domain_id, RPC_POLL_QOS, 1000); - } else { - GGMLQNN_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, - ggmlhexagon_get_dsp_name(domain_id)); - goto bail; - } + void free_rpcmem(void * buf); + void free_rpcmem(); - return 0; -bail: - if (ggmlop_domain_uri) { - free(ggmlop_domain_uri); - } + bool is_rpcmem_allocated(void * buf); - if (ctx->rpc_mempool) { - rpcmem_free(ctx->rpc_mempool); - ctx->rpc_mempool = nullptr; - ctx->rpc_mempool_len = 0; - ctx->ggmlop_handle = -1; - ctx->domain_id = -1; + bool is_rpcmem_registered(Qnn_MemHandle_t handle) { + return _qnn_mem_set.count(handle) != 0U; } - return -1; -} - -static void ggmlhexagon_close_cdsp(ggml_backend_qnn_context * ctx) { - int hexagon_error = AEE_SUCCESS; - GGMLQNN_LOG_INFO("enter %s", __func__); - if (-1 != ctx->ggmlop_handle) { - hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); - if (AEE_SUCCESS != hexagon_error) { - GGMLQNN_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error); - } else { - ctx->ggmlop_handle = -1; - } + bool enable_qnn_rpc() { + return _enable_qnn_rpc; } - if (ctx->rpc_mempool) { - rpcmem_free(ctx->rpc_mempool); - ctx->rpc_mempool = nullptr; - ctx->rpc_mempool_len = 0; - ctx->domain_id = -1; + HEXAGONBackend get_device_id() { + return _device_id; } - GGMLQNN_LOG_INFO("leave %s", __func__); -} -static void ggmlhexagon_compute(ggml_backend_qnn_context * ctx, struct ggml_tensor * op) { - //skip sanity check because already checked in other place - struct dsptensor dsptensor_0; - struct dsptensor dsptensor_1; - struct 
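// A hedged usage sketch of the qnn_instance lifecycle declared in this hunk.
// It is not standalone (it needs the class above plus the QNN headers), the
// library path and backend name are placeholders, and it assumes qnn_init()
// and qnn_finalize() return 0 on success, as the other loaders in this file do.
static int run_with_qnn_instance() {
    qnn_instance instance("/vendor/lib64/", "libQnnHtp.so", ""); // hypothetical paths
    if (instance.qnn_init(nullptr) != 0) {                       // nullptr: no QnnSaver config
        return -1;
    }
    // ... create a graph, register rpcmem buffers, execute, ...
    return instance.qnn_finalize();
}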
dsptensor dsptensor_2; - std::string op_name; - ggmlqnn_get_opkey_from_op(op, op_name); +private: + int load_system(); - qnn_perf op_perf(op_name); - op_perf.start(); + int unload_system(); - int hexagon_error = AEE_SUCCESS; - ggmlhexagon_op_func_t op_func = nullptr; - size_t input_tensor_count = 2; + int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); - ggml_tensor * src0 = op->src[0]; - ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; + int unload_backend(); - input_tensor_count = ggmlhexagon_k_op_caps[ggmlqnn_get_op_index(op)].input_param_count; - op_func = ggmlhexagon_k_op_caps[ggmlqnn_get_op_index(op)].dsp_op_func; - if (nullptr == op_func) { - GGMLQNN_LOG_DEBUG("op GGML_OP_%s and dsp func %s not supported on cCSP", ggml_op_name(op->op), ggmlhexagon_k_op_caps[ggmlqnn_get_op_index(op)].hexagon_op_name); - return; + void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_interface = raw_interface; } - dsptensor_0.data = src0->data; - dsptensor_0.data_len = ggml_nbytes(src0); - dsptensor_0.type = src0->type; + void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { + _qnn_raw_system_interface = raw_interface; + } - dsptensor_0.ne[0] = src0->ne[0]; - dsptensor_0.ne[1] = src0->ne[1]; - dsptensor_0.ne[2] = src0->ne[2]; - dsptensor_0.ne[3] = src0->ne[3]; + void * alloc_rpcmem_internal(size_t bytes, size_t alignment); - dsptensor_0.nb[0] = src0->nb[0]; - dsptensor_0.nb[1] = src0->nb[1]; - dsptensor_0.nb[2] = src0->nb[2]; - dsptensor_0.nb[3] = src0->nb[3]; + void htp_probe_rpc_meminfo(); - if (2 == input_tensor_count) { - dsptensor_1.data = src1->data; - dsptensor_1.type = src1->type; - dsptensor_1.data_len = ggml_nbytes(src1); + void htp_print_info(); - dsptensor_1.ne[0] = src1->ne[0]; - dsptensor_1.ne[1] = src1->ne[1]; - dsptensor_1.ne[2] = src1->ne[2]; - dsptensor_1.ne[3] = src1->ne[3]; + void print_backend_info(); - dsptensor_1.nb[0] = src1->nb[0]; - dsptensor_1.nb[1] = src1->nb[1]; - dsptensor_1.nb[2] = src1->nb[2]; - dsptensor_1.nb[3] = src1->nb[3]; - } + void htp_set_memory_grow_size(size_t size = 1ul * 1024 * 1024); - dsptensor_2.data = dst->data; - dsptensor_2.data_len = ggml_nbytes(dst); - dsptensor_2.type = dst->type; + void htp_enter_performance_mode(); - dsptensor_2.ne[0] = dst->ne[0]; - dsptensor_2.ne[1] = dst->ne[1]; - dsptensor_2.ne[2] = dst->ne[2]; - dsptensor_2.ne[3] = dst->ne[3]; + void htp_set_n_hvx_threads(size_t n_threads); - dsptensor_2.nb[0] = dst->nb[0]; - dsptensor_2.nb[1] = dst->nb[1]; - dsptensor_2.nb[2] = dst->nb[2]; - dsptensor_2.nb[3] = dst->nb[3]; +private: + static constexpr const int _required_num_providers = 1; - //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_0); - //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_1); - //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_2); - hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); - if (AEE_SUCCESS != hexagon_error) { - GGMLQNN_LOG_WARN("ggmlop %s computation fail on cdsp", ggml_op_name(op->op)); - } +private: + std::string _lib_path; + std::string _backend_name; + std::string _model_name; // name of prebuilt QNN model, might be used in the future + BackendIdType _backend_id; - op_perf.info(); - return; -} + bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode + bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated + QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; -// 
================================================================================================= -// section-7: backend helper function / class -// ================================================================================================= -static const char * ggmlqnn_get_socmodel_desc(uint32_t soc_model) { - switch (soc_model) { - case SM7450: - return "SM7450"; - case SM8350: - return "SM8350"; - case SM8450: - return "SM8450"; - case SM8475: - return "SM8475"; - case SM8550: - return "SM8550"; - case SM8650: - return "SM8650"; - case SM8750: - return "SM8750"; - default: - return "unknown"; + qnn_profile_level _profile_level = PROFILE_OFF; + + void * _system_lib_handle = nullptr; + void * _loaded_lib_handle = nullptr; + const QnnInterface_t * _loaded_backend = nullptr; + + Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + + Qnn_LogHandle_t _qnn_log_handle = nullptr; + + Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + + Qnn_DeviceHandle_t _qnn_device_handle = nullptr; + + Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + + Qnn_ContextHandle_t _qnn_context_handle = nullptr; + + QnnSystemContext_Handle_t _qnn_system_handle = nullptr; + + QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; + uint32_t _qnn_htp_powerconfig_id = 1; + uint32_t _qnn_htp_device_id = 0; + uint32_t _qnn_htp_core_id = 0; + + uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + + qnn_interface _qnn_interface; + QNN_INTERFACE_VER_TYPE _qnn_raw_interface; + QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + + std::unordered_map _qnn_mem_set; + std::unordered_map _qnn_rpc_buffer_to_handles; + + std::atomic_bool _rpcmem_initialized{false}; + pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; + pfn_rpc_mem_free _pfn_rpc_mem_free; + pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; + pfn_rpc_mem_init _pfn_rpc_mem_init; + pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + std::unordered_map _rpcmem_store_map; + std::unordered_map _rpcmem_usage_map; + size_t _rpcmem_usage = 0; // mempool usage in Mbytes + size_t _rpcmem_capacity = 512; // mempool size in Mbytes + + std::string _graph_name; + HEXAGONBackend _device_id; + void * _rpc_lib_handle = nullptr; + bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature + + qnn_instance(const qnn_instance &) = delete; + void operator=(const qnn_instance &) = delete; + + qnn_instance(qnn_instance &&) = delete; + void operator=(qnn_instance &&) = delete; +}; + +void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { + if (!_rpcmem_initialized) { + GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n"); + return nullptr; } -} -static const char * ggmlqnn_get_htparch_desc(size_t htp_arch) { - switch (htp_arch) { - case V68: - return "QCOM_HTP_V68"; - case V69: - return "QCOM_HTP_V69"; - case V73: - return "QCOM_HTP_V73"; - case V75: - return "QCOM_HTP_V75"; - case V79: - return "QCOM_HTP_V79"; - default: - return "unknown"; + auto allocate_bytes = static_cast(bytes + alignment); + void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); + if (nullptr == buf) { + GGMLHEXAGON_LOG_WARN("failed to allocate rpc memory\n"); + return nullptr; } -} -static const char * ggmlqnn_get_hwaccel_approach_name(int hwaccle_approach) { - switch (hwaccle_approach) { - case HWACCEL_QNN: - return "HWACCEL_QNN"; - case HWACCEL_QNN_SINGLEGRAPH: - return "HWACCEL_QNN_SINGLEGRAPH"; - case HWACCEL_CDSP: - return "HWACCEL_CDSP"; - default: - return "unknown hwaccel approach"; + auto aligned_buf = 
reinterpret_cast(ggmlqnn_align_to(alignment, + reinterpret_cast(buf))); + bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; + if (!status) { + GGMLHEXAGON_LOG_WARN("failed to allocate rpc memory\n"); + _pfn_rpc_mem_free(buf); } + return aligned_buf; } -static struct qcom_socinfo * ggmlqnn_get_socinfo_from_socmodel(uint32_t soc_model) { - size_t items = sizeof(g_qnn_soc_info_table) / sizeof(g_qnn_soc_info_table[0]); - for (size_t idx = 0; idx < items; idx++) { - if (soc_model == g_qnn_soc_info_table[idx].soc_model) { - return &g_qnn_soc_info_table[idx]; - } +void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { + if (_rpcmem_usage > (_rpcmem_capacity - 8)) { // reserve 8Mbytes in rpc mempool + GGMLHEXAGON_LOG_WARN("rpc mempool capcaity: %d MB, usage: %d MB", _rpcmem_capacity, _rpcmem_usage); + return nullptr; } - return nullptr; -} -static const char * ggmlqnn_get_ggml_type_name(ggml_type type) { - const auto * traits = ggml_get_type_traits(type); - return traits->type_name; -} + auto aligned_buf = alloc_rpcmem_internal(bytes, alignment); + if (nullptr == aligned_buf) + return nullptr; + _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes)); -// ref:explanation of k-quants, https://github.com/ggerganov/llama.cpp/pull/1684 -static Qnn_DataType_t ggmlqnn_datatype_from_ggml_datatype(enum ggml_type ggmltype) { - switch (ggmltype) { - case GGML_TYPE_F16: - return QNN_DATATYPE_FLOAT_16; - case GGML_TYPE_F32: - return QNN_DATATYPE_FLOAT_32; - case GGML_TYPE_I8: - return QNN_DATATYPE_INT_8; - case GGML_TYPE_Q8_0: - return QNN_DATATYPE_SFIXED_POINT_8; - case GGML_TYPE_Q4_0: - return QNN_DATATYPE_SFIXED_POINT_4; - default: - break; - } - return QNN_DATATYPE_UNDEFINED; + size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); + rpcmem_usage_in_bytes += bytes; + _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); + return aligned_buf; } -static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) { - switch (qnn_type) { - case QNN_DATATYPE_FLOAT_32: - return GGML_TYPE_F32; - case QNN_DATATYPE_FLOAT_16: - return GGML_TYPE_F16; - case QNN_DATATYPE_UINT_32: - case QNN_DATATYPE_INT_32: - return GGML_TYPE_I32; - case QNN_DATATYPE_INT_16: - return GGML_TYPE_I16; - case QNN_DATATYPE_INT_8: - return GGML_TYPE_I8; - case QNN_DATATYPE_SFIXED_POINT_8: - return GGML_TYPE_Q8_0; - case QNN_DATATYPE_SFIXED_POINT_4: - return GGML_TYPE_Q4_0; - default: - break; +void qnn_instance::free_rpcmem(void * buf) { + size_t rpcbuffer_size = 0; + if (!_rpcmem_initialized) { + GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n"); + } else if (0 == _rpcmem_store_map.count(buf)) { + GGMLHEXAGON_LOG_WARN("no allocated tensor\n"); + } else { + GGMLHEXAGON_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]); + for (std::unordered_map::iterator it = _rpcmem_usage_map.begin(); + it != _rpcmem_usage_map.end(); + it++) { + void * rpcbuffer = it->first; + if (buf == rpcbuffer) { + rpcbuffer_size = it->second; + size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); + rpcmem_usage_in_bytes -= rpcbuffer_size; + _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); + } + } + if (rpcbuffer_size != 0) { + _rpcmem_usage_map.erase(buf); + } else { + GGMLHEXAGON_LOG_WARN("it shouldn't happen, pls check why?"); + } + _pfn_rpc_mem_free(_rpcmem_store_map[buf]); + _rpcmem_store_map.erase(buf); } - return GGML_TYPE_COUNT; } -static void ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, const uint32_t * ggml_dimensions, uint32_t rank) { - if (rank > GGML_MAX_DIMS) { - 
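// alloc_rpcmem_internal() over-allocates by `alignment`, rounds the returned
// pointer up, and remembers the raw pointer so free_rpcmem() can release the
// original block. A self-contained sketch of that scheme, with malloc/free
// standing in for rpcmem_alloc/rpcmem_free and a local align_to() assumed to
// behave like ggmlqnn_align_to() for power-of-two alignments:
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <unordered_map>

static std::unordered_map<void *, void *> store_map; // aligned ptr -> raw ptr

static inline uintptr_t align_to(size_t alignment, uintptr_t addr) {
    return (addr + alignment - 1) & ~(uintptr_t)(alignment - 1); // alignment must be a power of two
}

static void * alloc_aligned(size_t bytes, size_t alignment) {
    void * raw = malloc(bytes + alignment); // over-allocate so the aligned pointer still fits
    if (raw == nullptr) {
        return nullptr;
    }
    void * aligned = (void *)align_to(alignment, (uintptr_t)raw);
    store_map[aligned] = raw;
    return aligned;
}

static void free_aligned(void * aligned) {
    auto it = store_map.find(aligned);
    if (it != store_map.end()) {
        free(it->second); // free the raw pointer, not the aligned one
        store_map.erase(it);
    }
}

int main() {
    void * p = alloc_aligned(1000, 64);
    printf("64-byte aligned: %d\n", (int)(((uintptr_t)p % 64) == 0)); // prints 1
    free_aligned(p);
    return 0;
}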
GGMLQNN_LOG_WARN("invalid params"); - return; - } - if (nullptr == qnn_dimensions || nullptr == ggml_dimensions) { - GGMLQNN_LOG_WARN("invalid params"); +void qnn_instance::free_rpcmem() { + if (_rpcmem_store_map.empty()) { + GGMLHEXAGON_LOG_WARN("no rpcmem allocated\n"); return; } - for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++) - qnn_dimensions[idx] = ggml_dimensions[idx]; - if (rank >= 2) { - qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2]; - qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1]; + for (std::unordered_map::iterator it = _rpcmem_store_map.begin(); + it != _qnn_mem_set.end(); + it++) { + void * rpcbuffer = it->second; + GGMLHEXAGON_LOG_DEBUG("free rpc buffer %p", rpcbuffer); + _pfn_rpc_mem_free(rpcbuffer); } + _rpcmem_store_map.clear(); + _rpcmem_usage_map.clear(); + _rpcmem_usage = 0; } -static void * ggmlqnn_type_trait(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - const enum ggml_type src0_type = src0->type; +int32_t qnn_instance::rpcmem_to_fd(void * buf) { + int32_t mem_fd = -1; + if (!is_rpcmem_initialized()) { + GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n"); + } else { + mem_fd = _pfn_rpc_mem_to_fd(buf); + } - GGML_TENSOR_BINARY_OP_LOCALS - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - GGML_ASSERT(nb00 == ggml_type_size(src0_type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + return mem_fd; +} - const int64_t ne_plane = ne01 * ne00; - const size_t desired_size = ((GGML_TYPE_F32 == src0_type) ? 0 : ne03 * ne02 * ne_plane * sizeof(float)); - ctx->desired_size = desired_size; - if (ctx->work_size < desired_size) { - ctx->work_data.reset(new char[desired_size]); - ctx->work_size = desired_size; +int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { + if (nullptr == p_data || (nullptr == p_tensor)) { + GGMLHEXAGON_LOG_WARN("invalid param\n"); + return 1; } - ctx->n_threads = std::thread::hardware_concurrency(); - void * wdata = ctx->work_data.get(); - // convert src0 to float - if (src0_type != GGML_TYPE_F32) { - const auto * type_traits = ggml_get_type_traits(src0_type); - ggml_to_float_t const to_float = type_traits->to_float; - - for (int64_t i03 = 0; i03 < ne03; i03++) { - for (int64_t i02 = 0; i02 < ne02; i02++) { - const void * x = (char *)src0->data + i02 * nb02 + i03 * nb03; - float * const wplane = (float *)wdata + i02 * ne_plane + i03 * ne02 * ne_plane; - const int min_cols_per_thread = 4096; - const int min_rows_per_thread = std::max((int)(min_cols_per_thread / ne00), 1); - const int n_threads = std::max( - std::min(ctx->n_threads, (int)(ne01 / min_rows_per_thread)), 1); - for (int i = 1; i < n_threads; i++) { - const int64_t start = i * ne01 / n_threads; - const int64_t end = (i + 1) * ne01 / n_threads; - if (start < end) { - ctx->tasks.push_back(std::async(std::launch::async, [=]() { - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *)x + i01 * nb01, wplane + i01 * ne00, ne00); - } - })); - } - } - { - // reuse the current thread for the first task - const int64_t start = 0; - const int64_t end = ne01 / n_threads; - for (int64_t i01 = start; i01 < end; i01++) { - to_float((const char *) x + i01 * nb01, wplane + i01 * ne00, ne00); - } - } - } - } + if (!is_rpcmem_initialized()) { + GGMLHEXAGON_LOG_WARN("rpc memory not initialized\n"); + return 2; + } - // wait for all tasks to finish - for (auto &task: ctx->tasks) { - 
task.get(); - } - ctx->tasks.clear(); + if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { + GGMLHEXAGON_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + return 3; } - return wdata; -} -static void ggmlqnn_append_tensor_dimensions(const ggml_tensor * tensor, std::string & output) { - char buffer[GGML_QNN_TMPBUF_LEN] = {}; - const char * type_name = ggmlqnn_get_ggml_type_name(tensor->type); - int len = 0; - switch (ggml_n_dims(tensor)) { - case 1: - len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name); - break; - case 2: - len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name); - break; - case 3: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], type_name); - break; - case 4: - default: - len = snprintf(buffer, sizeof(buffer), "%ldx%ldx%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], - (long)tensor->ne[2], (long)tensor->ne[3], type_name); - break; + int32_t mem_fd = rpcmem_to_fd(p_data); + if (-1 == mem_fd) { + GGMLHEXAGON_LOG_WARN("failed to get file descriptor\n"); + return 4; } - GGML_ASSERT(len > 0 && len < (int)sizeof(buffer)); - output.append(buffer, len); -} + GGMLHEXAGON_LOG_DEBUG("mem_fd %d\n", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, + QNN_VER_PTR(*p_tensor)->dataType, + QNN_MEM_TYPE_ION, + {{mem_fd}}}; + Qnn_MemHandle_t handle = nullptr; + int error = QNN_SUCCESS; + error = _qnn_interface.qnn_mem_register( + _qnn_context_handle, + &descriptor, + /*numDescriptors=*/1, + &handle); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); + return 5; + } else { + GGMLHEXAGON_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + } + QNN_VER_PTR(*p_tensor)->memHandle = handle; + _qnn_mem_set.insert((std::pair(p_data, handle))); -static size_t ggmlqnn_get_opcaps_size() { - return std::size(ggmlqnn_k_op_caps); + return 0; } -static size_t ggmlqnn_get_op_index(const ggml_tensor * tensor) { - if (tensor->op == GGML_OP_UNARY) { - return static_cast(GGML_OP_COUNT) + static_cast(ggml_get_unary_op(tensor)); +Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type) { + if (!p_data) { + GGMLHEXAGON_LOG_WARN("invalid param"); + return nullptr; } - return tensor->op; -} - -static size_t ggmlqnn_get_op_input_param_count(const ggml_tensor * op) { - auto op_index = ggmlqnn_get_op_index(op); - GGML_ASSERT(op_index < std::size(ggmlqnn_k_op_caps)); - return ggmlqnn_k_op_caps[op_index].input_param_count; -} + if (!is_rpcmem_initialized()) { + GGMLHEXAGON_LOG_WARN("rpc memory not initialized"); + return nullptr; + } -static void ggmlqnn_get_opkey_from_op(const ggml_tensor * op, std::string & output) { - GGML_ASSERT(op->op != GGML_OP_NONE); - output += ggml_op_desc(op); - output += ggmlqnn_get_ggml_type_name(op->type); - size_t param_count = ggmlqnn_get_op_input_param_count(op); - for (size_t i = 0; i < param_count; ++i) { - auto * input = op->src[i]; - if (!input) { - break; - } - output += '_'; - ggmlqnn_append_tensor_dimensions(input, output); + if (is_rpcmem_registered(p_data)) { + GGMLHEXAGON_LOG_WARN("rpc memory already registered"); + return _qnn_rpc_buffer_to_handles[p_data]; } -} -static void 
ggmlqnn_get_opkey_with_srcop_desc(const ggml_tensor * op, std::string & output) { - output += ggml_op_desc(op); - output += '('; - if (op->src[0]) { - output += ggml_op_desc(op->src[0]); + int32_t mem_fd = rpcmem_to_fd(p_data); + if (mem_fd == -1) { + GGMLHEXAGON_LOG_WARN("failed to get file descriptor"); + return nullptr; } - for (size_t i = 1; i < GGML_MAX_DIMS && op->src[i]; ++i) { - output += ','; - output += ggml_op_desc(op->src[i]); + + GGMLHEXAGON_LOG_DEBUG("mem_fd %d", mem_fd); + Qnn_MemDescriptor_t descriptor = { + {rank, dimensions, nullptr}, + data_type, QNN_MEM_TYPE_ION, + {{mem_fd}} + }; + Qnn_MemHandle_t handle = nullptr; + Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); + return nullptr; } - output += ')'; + + _qnn_rpc_buffer_to_handles.insert({p_data, handle}); + GGMLHEXAGON_LOG_DEBUG("successfully register shared memory handler: %p", handle); + return handle; } -static void ggmlqnn_get_graphkey_from_cgraph(const ggml_cgraph * cgraph, std::string & output) { - if (nullptr == cgraph || 0 == cgraph->n_nodes) { - GGMLQNN_LOG_WARN("empty ggml computational graph"); - return; +void * qnn_instance::get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); + it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + if (it->second == mem_handle) { + return it->first; + } } + GGMLHEXAGON_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); + return nullptr; +} - bool is_start = true; - for (int i = 0; i < cgraph->n_nodes; ++i) { - auto * op = cgraph->nodes[i]; - if (ggml_is_empty(op)) { - GGMLQNN_LOG_WARN("empty op in graph, skipping"); - continue; - } +void qnn_instance::unregister_rpcmem() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (op->op == GGML_OP_NONE) { - GGMLQNN_LOG_WARN("GGML_OP_NONE in graph, skipping"); - continue; - } + if (_qnn_mem_set.empty()) { + GGMLHEXAGON_LOG_WARN("no rpcmem registered\n"); + } - if (is_start) { - ggmlqnn_get_opkey_from_op(cgraph->nodes[0], output); - is_start = false; + for (std::unordered_map::iterator it = _qnn_mem_set.begin(); + it != _qnn_mem_set.end(); + it++) { + Qnn_MemHandle_t mem_handle = it->second; + error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); } else { - output += '#'; - ggmlqnn_get_opkey_with_srcop_desc(op, output); + GGMLHEXAGON_LOG_DEBUG("unregister shared memory ok"); } } + _qnn_mem_set.clear(); +} + +void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) { + Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); + } - if (cgraph->n_nodes > 1) { - auto * last_op = cgraph->nodes[cgraph->n_nodes - 1]; - output += ggmlqnn_get_ggml_type_name(last_op->type); - output += '_'; - ggmlqnn_append_tensor_dimensions(last_op, output); + auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), + [mem_handle](const auto &kv) { return kv.second == mem_handle; }); + if (it == _qnn_mem_set.end()) { + GGMLHEXAGON_LOG_WARN("failed to find shared memory handler: %p", mem_handle); + return; } + + _qnn_mem_set.erase(it); } 
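// unregister_rpcmem(Qnn_MemHandle_t) above looks up a map that is keyed by the
// buffer pointer, so finding the entry for a given handle is a linear search
// by value with std::find_if. A self-contained sketch of that reverse-lookup
// idiom (plain void pointers stand in for data buffers and Qnn_MemHandle_t):
#include <algorithm>
#include <cstdio>
#include <unordered_map>

int main() {
    std::unordered_map<void *, void *> mem_set; // buffer -> handle
    int buf = 0;
    int hnd = 0;
    mem_set.insert({ (void *)&buf, (void *)&hnd });

    void * wanted = &hnd;
    auto it = std::find_if(mem_set.begin(), mem_set.end(),
                           [wanted](const auto & kv) { return kv.second == wanted; });
    if (it != mem_set.end()) {
        printf("found buffer %p for handle %p\n", it->first, it->second);
        mem_set.erase(it); // drop the registration once the handle is released
    }
    return 0;
}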
-template -Fn ggmlqnn_load_qnn_functionpointers(void * handle, const char * function_name) { - return reinterpret_cast(dlsym(handle, function_name)); +bool qnn_instance::is_rpcmem_allocated(void * buf) { + return _rpcmem_store_map.count(buf) != 0U; } -class qnn_cfg { -public: - void dump(std::function worker) { - if (!_load_success) { - GGMLQNN_LOG_INFO("qnn cfg file %s not loaded", _cfg_filename.c_str()); - return; - } - auto iter = _qnn_cfg.begin(); - while (iter != _qnn_cfg.end()) { - auto kv_iter = iter->second.begin(); - while (kv_iter != iter->second.end()) { - worker(iter->first, kv_iter->first, kv_iter->second); - ++kv_iter; - } - ++iter; - } - } +int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + GGMLHEXAGON_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); - bool load(const std::string & file_name) { - if (file_name == "") { - return false; - } - _cfg_filename = file_name; - std::ifstream in; - std::string line; - in.open(file_name.c_str()); - if (not in.is_open()) { - GGMLQNN_LOG_WARN("can't open file %s", file_name.c_str()); - return false; - } - while (getline(in, line)) { - std::string section, key, value; - if (not parse_line(line, section, key, value)) { - continue; - } - set_section_keyvalue(section, key, value); - } - _load_success = true; - return true; + void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); + if (nullptr == lib_handle) { + GGMLHEXAGON_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); + return 1; } - void get_stringvalue(const std::string & section, const std::string & key, std::string & value, std::string default_value) { - value = default_value; - if (_qnn_cfg.find(section) == _qnn_cfg.end()) { - return; - } - if (_qnn_cfg[section].find(key) == _qnn_cfg[section].end()) { - return; - } - value = _qnn_cfg[section][key]; + auto get_providers = ggmlqnn_load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( + lib_handle, + "QnnInterface_getProviders"); + if (nullptr == get_providers) { + GGMLHEXAGON_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); + return 2; } - void get_intvalue(const std::string & section, const std::string & key, int & value, int default_value) { - value = default_value; - if (_qnn_cfg.find(section) == _qnn_cfg.end()) { - return; - } - if (_qnn_cfg[section].find(key) == _qnn_cfg[section].end()) { - return; - } - value = atol(_qnn_cfg[section][key].c_str()); + std::uint32_t num_providers = 0; + const QnnInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); + return 3; + } + GGMLHEXAGON_LOG_DEBUG("num_providers=%d\n", num_providers); + if (num_providers != _required_num_providers) { + GGMLHEXAGON_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); + return 4; } -private: - void ltrim(std::string & str) { - if (str.empty()) return; - size_t len = 0; - char* temp = (char*)str.c_str(); - while (*temp && isblank(*temp)) { - ++len; - ++temp; + if (nullptr == provider_list) { + GGMLHEXAGON_LOG_WARN("failed to get qnn interface providers\n"); + return 5; + } + bool found_valid_interface = false; + QNN_INTERFACE_VER_TYPE qnn_interface; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && + 
QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { + found_valid_interface = true; + qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; + break; } - if (len > 0) str.erase(0, len); } - void rtrim(std::string & str) { - if (str.empty()) return; - size_t len = str.length(); - size_t pos = len; - while (pos > 0) { - if (not isblank(str[pos - 1])) { - break; - } - --pos; - } - if (pos != len) str.erase(pos); - } - - void trim(std::string& str) { - ltrim(str); - rtrim(str); + if (!found_valid_interface) { + GGMLHEXAGON_LOG_WARN("unable to find a valid qnn interface\n"); + return 6; + } else { + GGMLHEXAGON_LOG_INFO("find a valid qnn interface\n"); } + set_qnn_raw_interface(qnn_interface); - void set_section_keyvalue(std::string & section, std::string & key, std::string & value) { - if (_qnn_cfg.find(section) == _qnn_cfg.end()) { - std::unordered_map kv_map; - _qnn_cfg[section] = kv_map; - } - if (key != "" && value != "") _qnn_cfg[section][key] = value; - } + BackendIdType backend_id = provider_list[0]->backendId; + _loaded_backend = provider_list[0]; + _loaded_lib_handle = lib_handle; + _backend_id = backend_id; - bool parse_line(std::string & line, std::string & section, std::string & key, std::string & value) { - static std::string cur_section = ""; - std::string nodes[2] = {"#", ";"}; - for (int i = 0; i < 2; ++i) { - std::string::size_type pos = line.find(nodes[i]); - if (pos != std::string::npos) line.erase(pos); - } - trim(line); - if (line == "") return false; - if (line[0] == '[' && line[line.size() - 1] == ']') { - section = line.substr(1, line.size() - 2); - trim(section); - cur_section = section; - return false; - } - if (cur_section == "") return false; - bool is_key = true; - for (size_t i = 0; i < line.size(); ++i) { - if (line[i] == '=') { - is_key = false; - continue; - } - if (is_key) { - key += line[i]; - } else { - value += line[i]; - } + auto saver_initialize = + ggmlqnn_load_qnn_functionpointers<_pfn_QnnSaver_initialize *>(_loaded_lib_handle, "QnnSaver_initialize"); + if (nullptr != saver_initialize) { + error = saver_initialize(saver_config); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); + return 7; } - section = cur_section; - trim(key); - trim(value); - return true; + } else { + GGMLHEXAGON_LOG_WARN("saver_initialize is null\n"); } -private: - std::unordered_map> _qnn_cfg; - bool _load_success = false; - std::string _cfg_filename; -}; - -class qnn_interface { -#define DEFINE_SHIM_FUNCTION_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... args) const { \ - return (_qnn_interface->QNN_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } + return 0; +} -#define DEFINE_SHIM_FUNCTION_SYS_INTERFACE(F, pointer_name) \ - template \ - inline auto qnn_##F(Args... 
args) const { \ - return (_qnn_sys_interface->QNN_SYSTEM_INTERFACE_VER_NAME.pointer_name)( \ - std::forward(args)...); \ - } +int qnn_instance::unload_backend() { + int dlclose_error = 0; + dlclose_error = dlclose(_loaded_lib_handle); + if (dlclose_error != 0) { + GGMLHEXAGON_LOG_WARN("failed to close QNN backend %d, error %s\n", _backend_id, dlerror()); + } - friend class qnn_instance; + return 0; +} -public: - qnn_interface() = default; +int qnn_instance::load_system() { + Qnn_ErrorHandle_t error = QNN_SUCCESS; - // QnnBackend - DEFINE_SHIM_FUNCTION_INTERFACE(backend_create, backendCreate) +#if !defined(__ANDROID__) && !defined(__linux__) + std::string system_lib_path = _lib_path + "QnnSystem.dll"; +#else + std::string system_lib_path = _lib_path + "libQnnSystem.so"; +#endif + GGMLHEXAGON_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); - DEFINE_SHIM_FUNCTION_INTERFACE(backend_free, backendFree) + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + //re-try with default path of QNN binary runtime lib + _lib_path = std::string(g_hexagon_appcfg.runtimelib_path); +#if !defined(__ANDROID__) && !defined(__linux__) + system_lib_path = _lib_path + "QnnSystem.dll"; +#else + system_lib_path = _lib_path + "libQnnSystem.so"; +#endif + _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); + return 1; + } + } - DEFINE_SHIM_FUNCTION_INTERFACE(backend_register_op_package, backendRegisterOpPackage) + auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( + _system_lib_handle, "QnnSystemInterface_getProviders")); + if (nullptr == get_providers) { + GGMLHEXAGON_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); + return 2; + } - DEFINE_SHIM_FUNCTION_INTERFACE(backend_validate_op_config, backendValidateOpConfig) + uint32_t num_providers = 0; + const QnnSystemInterface_t ** provider_list = nullptr; + error = get_providers(&provider_list, &num_providers); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); + return 3; + } - DEFINE_SHIM_FUNCTION_INTERFACE(backend_get_api_version, backendGetApiVersion) + if (num_providers != _required_num_providers) { + GGMLHEXAGON_LOG_WARN("providers is %d instead of required %d\n", num_providers, _required_num_providers); + return 4; + } - // QnnDevice - DEFINE_SHIM_FUNCTION_INTERFACE(device_create, deviceCreate) + if (nullptr == provider_list) { + GGMLHEXAGON_LOG_WARN("can not get providers\n"); + return 5; + } - DEFINE_SHIM_FUNCTION_INTERFACE(device_free, deviceFree) + QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; + bool found_valid_system_interface = false; + for (size_t idx = 0; idx < num_providers; idx++) { + if (QNN_SYSTEM_API_VERSION_MAJOR == + provider_list[idx]->systemApiVersion.major && + QNN_SYSTEM_API_VERSION_MINOR <= + provider_list[idx]->systemApiVersion.minor) { + found_valid_system_interface = true; + qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; + break; + } + } + if (!found_valid_system_interface) { + GGMLHEXAGON_LOG_WARN("unable to find a valid qnn system interface\n"); + return 6; + } else { + GGMLHEXAGON_LOG_INFO("find a valid qnn system 
interface\n"); + } + set_qnn_raw_system_interface(qnn_system_interface); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_infrastructure, deviceGetInfrastructure) + _qnn_interface.set_qnn_system_interface(provider_list[0]); - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_platform_info, deviceGetPlatformInfo) + _qnn_interface.qnn_system_context_create(&_qnn_system_handle); + if (nullptr == _qnn_system_handle) { + GGMLHEXAGON_LOG_WARN("can not create QNN system contenxt\n"); + } else { + GGMLHEXAGON_LOG_INFO("initialize qnn system successfully\n"); + } - DEFINE_SHIM_FUNCTION_INTERFACE(device_get_info, deviceGetInfo) + return 0; +} - // QnnContext - DEFINE_SHIM_FUNCTION_INTERFACE(context_create, contextCreate) +int qnn_instance::unload_system() { + int result = 0; - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary_size, contextGetBinarySize) + if (nullptr == _system_lib_handle) { + GGMLHEXAGON_LOG_DEBUG("system lib handle is null\n"); + return 1; + } - DEFINE_SHIM_FUNCTION_INTERFACE(context_get_binary, contextGetBinary) + if (nullptr != _qnn_system_handle) { + result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); + if (result != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to free QNN system context\n"); + } + _qnn_system_handle = nullptr; + } - DEFINE_SHIM_FUNCTION_INTERFACE(context_create_from_binary, contextCreateFromBinary) + int dlclose_error = dlclose(_system_lib_handle); + if (dlclose_error != 0) { + GGMLHEXAGON_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); + return 2; + } - DEFINE_SHIM_FUNCTION_INTERFACE(context_free, contextFree) + _system_lib_handle = nullptr; - // QnnGraph - DEFINE_SHIM_FUNCTION_INTERFACE(graph_create, graphCreate) + return result; +} - DEFINE_SHIM_FUNCTION_INTERFACE(graph_add_node, graphAddNode) +static void ggmlqnn_sdk_logcallback(const char * fmt, + QnnLog_Level_t level, + uint64_t timestamp, + va_list argp) { - DEFINE_SHIM_FUNCTION_INTERFACE(graph_finalize, graphFinalize) + if (0 == g_hexagon_appcfg.print_qnn_internal_log) + return; - DEFINE_SHIM_FUNCTION_INTERFACE(graph_execute, graphExecute) + static std::mutex log_mutex; + static unsigned char s_ggmlqnn_sdk_logbuf[GGMLHEXAGON_LOGBUF_LEN]; - DEFINE_SHIM_FUNCTION_INTERFACE(graph_retrieve, graphRetrieve) + const char * log_level_desc = ""; + switch (level) { + case QNN_LOG_LEVEL_ERROR: + log_level_desc = " ERROR "; + break; + case QNN_LOG_LEVEL_WARN: + log_level_desc = "WARNING"; + break; + case QNN_LOG_LEVEL_INFO: + log_level_desc = " INFO "; + break; + case QNN_LOG_LEVEL_DEBUG: + log_level_desc = " DEBUG "; + break; + case QNN_LOG_LEVEL_VERBOSE: + log_level_desc = "VERBOSE"; + break; + case QNN_LOG_LEVEL_MAX: + log_level_desc = "UNKNOWN"; + break; + } - // QnnLog - DEFINE_SHIM_FUNCTION_INTERFACE(log_create, logCreate) + double ms = (double) timestamp / 1000000.0; + { + std::lock_guard lock(log_mutex); + memset(s_ggmlqnn_sdk_logbuf, 0, GGMLHEXAGON_LOGBUF_LEN); + vsnprintf(reinterpret_cast(s_ggmlqnn_sdk_logbuf), GGMLHEXAGON_LOGBUF_LEN, fmt, argp); + GGMLHEXAGON_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_sdk_logbuf); + } +} - DEFINE_SHIM_FUNCTION_INTERFACE(log_free, logFree) +int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { + BackendIdType backend_id = QNN_BACKEND_ID_NULL; + GGMLHEXAGON_LOG_DEBUG("enter qni_init\n"); - DEFINE_SHIM_FUNCTION_INTERFACE(log_set_log_level, logSetLogLevel) + _device_id = HEXAGON_BACKEND_GGML; + if (_backend_name.find("QnnCpu") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNCPU; + } + if 
(_backend_name.find("QnnGpu") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNGPU; + } + if (_backend_name.find("QnnHtp") != std::string::npos) { + _device_id = HEXAGON_BACKEND_QNNNPU; + } + if (HEXAGON_BACKEND_GGML == _device_id) { + GGMLHEXAGON_LOG_INFO("user specified qnn backend is ggml, skip QNN initialize"); + return 0; + } - // QnnProfile - DEFINE_SHIM_FUNCTION_INTERFACE(profile_create, profileCreate) + if (0 != load_system()) { + GGMLHEXAGON_LOG_WARN("can not load QNN system lib, pls check why?\n"); + return 1; + } else { + GGMLHEXAGON_LOG_DEBUG("load QNN system lib successfully\n"); + } - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_events, profileGetEvents) + std::string backend_lib_path = _lib_path + _backend_name; - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_sub_events, profileGetSubEvents) + int is_load_ok = load_backend(backend_lib_path, saver_config); + if (0 != is_load_ok) { + GGMLHEXAGON_LOG_WARN("failed to load QNN backend\n"); + return 2; + } - DEFINE_SHIM_FUNCTION_INTERFACE(profile_get_event_data, profileGetEventData) - - DEFINE_SHIM_FUNCTION_INTERFACE(profile_free, profileFree) - - // QnnMem - DEFINE_SHIM_FUNCTION_INTERFACE(mem_register, memRegister) - - DEFINE_SHIM_FUNCTION_INTERFACE(mem_de_register, memDeRegister) - - // QnnProperty - DEFINE_SHIM_FUNCTION_INTERFACE(property_has_capability, propertyHasCapability) - - // QnnTensor - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_context_tensor, tensorCreateContextTensor) + _qnn_interface.set_qnn_interface(_loaded_backend); +#if 1 + _qnn_interface.qnn_log_create(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); +#else + _qnn_raw_interface.logCreate(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); +#endif + if (nullptr == _qnn_log_handle) { + GGMLHEXAGON_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone + return 3; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn log successfully\n"); + } - DEFINE_SHIM_FUNCTION_INTERFACE(tensor_create_graph_tensor, tensorCreateGraphTensor) + std::vector temp_backend_config; + _qnn_interface.qnn_backend_create(_qnn_log_handle, + temp_backend_config.empty() ? 
nullptr : temp_backend_config.data(), + &_qnn_backend_handle); + if (nullptr == _qnn_backend_handle) { + GGMLHEXAGON_LOG_WARN("why failed to initialize qnn backend\n"); + return 4; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn backend successfully\n"); + } - // QnnSystem - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_create, systemContextCreate) + if (nullptr != _qnn_raw_interface.propertyHasCapability) { + auto qnnstatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); + if (QNN_PROPERTY_NOT_SUPPORTED == qnnstatus) { + GGMLHEXAGON_LOG_WARN("device property is not supported\n"); + } + if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnstatus) { + GGMLHEXAGON_LOG_WARN("device property is not known to backend\n"); + } + } - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_get_binary_info, systemContextGetBinaryInfo) + Qnn_ErrorHandle_t qnnstatus = QNN_SUCCESS; + if (_device_id == HEXAGON_BACKEND_QNNNPU) { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + qcom_socinfo soc_info = {}; + qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + if (QNN_SUCCESS == qnnstatus) { + GGMLHEXAGON_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; + for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLHEXAGON_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, + (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + chipinfo = devinfo->onChipDevice; + size_t htp_arch = (size_t) chipinfo.arch; + GGMLHEXAGON_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); + soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); + } else { + GGMLHEXAGON_LOG_WARN("failed to get platform info, are we in emulator?\n"); + soc_info = { NONE, UNKNOWN_SM, 0 }; + } - DEFINE_SHIM_FUNCTION_SYS_INTERFACE(system_context_free, systemContextFree) + QnnHtpDevice_CustomConfig_t soc_customconfig; + soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; + soc_customconfig.socModel = soc_info.soc_model; + QnnDevice_Config_t soc_devconfig; + soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + soc_devconfig.customConfig = &soc_customconfig; - void set_qnn_interface(const QnnInterface_t * qnn_interface) { - _qnn_interface = qnn_interface; + /* + QnnHtpDevice_CustomConfig_t arch_customconfig; + arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; + arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)soc_info.htp_arch; + arch_customconfig.arch.deviceId = 0; + QnnDevice_Config_t arch_devconfig; + arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; + arch_devconfig.customConfig = &arch_customconfig; + */ + const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, nullptr }; + qnnstatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); + } else { + qnnstatus = _qnn_interface.qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); } - - void set_qnn_system_interface(const QnnSystemInterface_t * qnn_sys_interface) { - _qnn_sys_interface = qnn_sys_interface; + if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { + GGMLHEXAGON_LOG_WARN("failed to create QNN device\n"); + } else { + GGMLHEXAGON_LOG_INFO("create device successfully\n"); } - uint32_t get_backend_id() const { - return _qnn_interface->backendId; + if (PROFILE_OFF != _profile_level) { + GGMLHEXAGON_LOG_INFO("profiling turned on; level = %d", _profile_level); + if (PROFILE_BASIC == _profile_level) { + GGMLHEXAGON_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { + GGMLHEXAGON_LOG_WARN("unable to create profile handle in the backend\n"); + return 5; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } else if (PROFILE_DETAIL == _profile_level) { + GGMLHEXAGON_LOG_INFO("detailed profiling requested. 
Creating Qnn Profile object\n"); + if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( + _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { + GGMLHEXAGON_LOG_WARN("unable to create profile handle in the backend\n"); + return 6; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn profile successfully\n"); + } + } } - bool is_loaded() const { - return ((_qnn_sys_interface != nullptr) && (_qnn_interface != nullptr)); +#if defined(__ANDROID__) || defined(__linux__) + std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtimelib_path) + "libcdsprpc.so"); + full_path /= std::filesystem::path("libcdsprpc.so").filename(); + _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + GGMLHEXAGON_LOG_WARN("failed to load %s\n", full_path.c_str()); + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + } +#else + _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); +#endif + if (nullptr == _rpc_lib_handle) { + GGMLHEXAGON_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 7; + } else { + GGMLHEXAGON_LOG_DEBUG("load rpcmem lib successfully\n"); + set_rpcmem_initialized(true); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); + if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { + GGMLHEXAGON_LOG_WARN("unable to access symbols in QNN RPC lib, dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 8; } -private: - const QnnInterface_t * _qnn_interface = nullptr; - - const QnnSystemInterface_t * _qnn_sys_interface = nullptr; -}; - -class qnn_instance { -public: - using BackendIdType = decltype(QnnInterface_t{}.backendId); - - explicit qnn_instance(const std::string & lib_path, const std::string & backend_name, - const std::string & model_name) : - _lib_path(std::move(lib_path)), - _backend_name(std::move(backend_name)), - _model_name(std::move(model_name)) {} + if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_init(); - ~qnn_instance() { + std::vector temp_context_config; + _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, + temp_context_config.empty() ? 
nullptr : temp_context_config.data(), + &_qnn_context_handle); + if (nullptr == _qnn_context_handle) { + GGMLHEXAGON_LOG_WARN("why failed to initialize qnn context, error:%s\n", strerror(errno)); + return 9; + } else { + GGMLHEXAGON_LOG_DEBUG("initialize qnn context successfully\n"); } - int qnn_init(const QnnSaver_Config_t ** saver_config); - - int qnn_finalize(); + if (_backend_name.find("Htp") != std::string::npos) { + htp_print_info(); + htp_probe_rpc_meminfo(); - const qnn_interface & get_qnn_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + if (0 != htp_init_perfinfra()) { + GGMLHEXAGON_LOG_WARN("initialize HTP performance failure"); } - return _qnn_interface; - } - const QNN_INTERFACE_VER_TYPE & get_qnn_raw_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); - } - return _qnn_raw_interface; - } + htp_enter_performance_mode(); + htp_set_memory_grow_size(); - const QNN_SYSTEM_INTERFACE_VER_TYPE & get_qnn_raw_system_interface() { - if (!_qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("pls check why _qnn_interface is not loaded\n"); + if (enable_qnn_rpc()) { + GGMLHEXAGON_LOG_INFO("NPU RPC feature enabled with QNN-NPU backend"); + } else { + GGMLHEXAGON_LOG_INFO("NPU RPC feature disabled with QNN-NPU backend"); } - return _qnn_raw_system_interface; } - Qnn_LogHandle_t get_qnn_log_handle() { return _qnn_log_handle; } + print_backend_info(); - Qnn_ProfileHandle_t get_qnn_profile_handle() { return _qnn_profile_handle; } + GGMLHEXAGON_LOG_DEBUG("leave qni_init\n"); - Qnn_DeviceHandle_t get_qnn_device_handle() { return _qnn_device_handle; } + return 0; +} - Qnn_BackendHandle_t get_qnn_backend_handle() { return _qnn_backend_handle; } +int qnn_instance::qnn_finalize() { + int ret_status = 0; + Qnn_ErrorHandle_t error = QNN_SUCCESS; - Qnn_ContextHandle_t get_qnn_context_handle() { return _qnn_context_handle; } + GGMLHEXAGON_LOG_INFO("enter %s\n", __func__); + ggmlqnn_reset_idx(); - QnnSystemContext_Handle_t get_qnn_system_handle() { return _qnn_system_handle; } + free_rpcmem(); + unregister_rpcmem(); - Qnn_GraphHandle_t get_qnn_graph_handle() { return _qnn_graph_handle; } + if (nullptr != _pfn_rpc_mem_deinit) + _pfn_rpc_mem_deinit(); - int init_qnn_graph(const char * graph_name, - bool debug, - uint8_t do_node_validation = 1, - const QnnGraph_Config_t ** graph_configs = nullptr - ); - int init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb = 8, size_t hvx_threads = 8); + if (0 != dlclose(_rpc_lib_handle)) { + GGMLHEXAGON_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); + } else { + GGMLHEXAGON_LOG_DEBUG("succeed to close rpcmem lib\n"); + } - int finalize_qnn_graph(); + if (nullptr != _qnn_context_handle) { + error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - bool is_valid_graph() const { return _qnn_graph_handle != nullptr; } + } + _qnn_context_handle = nullptr; + } - int htp_init_perfinfra(); + if (nullptr != _qnn_profile_handle) { + error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - int htp_set_rpc_polling(); + } 
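For orientation, the initialization sequence implemented by qnn_init() above condenses to the following steps; this is a summary of the code in this hunk, not additional behavior:

// qnn_init() overview (NPU path; the CPU/GPU backends skip the HTP-specific steps):
//   1. map _backend_name to a device id; return early for the ggml (CPU fallback) backend
//   2. load_system()  - dlopen QnnSystem, pick a system interface provider, create the system context
//   3. load_backend() - dlopen the backend lib, pick a provider by API version, hook QnnSaver if present
//   4. qnn_log_create() and qnn_backend_create()
//   5. deviceGetPlatformInfo() + deviceCreate(), with a SoC custom config on the HTP device
//   6. optionally create a profile handle (PROFILE_BASIC / PROFILE_DETAIL)
//   7. dlopen libcdsprpc and resolve the rpcmem_* symbols
//   8. qnn_context_create()
//   9. HTP extras: htp_print_info(), htp_probe_rpc_meminfo(), htp_init_perfinfra(),
//      htp_enter_performance_mode(), htp_set_memory_grow_size(), then print_backend_info()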
+ _qnn_profile_handle = nullptr; + } - int htp_set_high_performance_mode(); + if (nullptr != _qnn_device_handle) { + error = _qnn_interface.qnn_device_free(_qnn_device_handle); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to free QNN device_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - std::string & get_qnn_graph_name() { return _graph_name; } + } + _qnn_device_handle = nullptr; + } + + if (nullptr != _qnn_backend_handle) { + error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_backend_handle = nullptr; - bool is_rpcmem_initialized() { - return _rpcmem_initialized; } - void set_rpcmem_initialized(bool initialized) { - _rpcmem_initialized = initialized; + if (nullptr != _qnn_log_handle) { + error = _qnn_interface.qnn_log_free(_qnn_log_handle); + if (error != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", + _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + } + _qnn_log_handle = nullptr; } - size_t get_rpcmem_capacity() { return _rpcmem_capacity; } - size_t get_rpcmem_usage() { return _rpcmem_usage; } + unload_backend(); + unload_system(); - int32_t rpcmem_to_fd(void * buf); + GGMLHEXAGON_LOG_INFO("leave %s\n", __func__); + return ret_status; +} - int register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor); - Qnn_MemHandle_t register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type); +int qnn_instance::init_qnn_graph(const std::string & graph_name, HEXAGONBackend device, size_t vtcm_size_in_mb, size_t hvx_threads) { + _graph_name = graph_name; + _device_id = device; - void unregister_rpcmem(); - void unregister_rpcmem(Qnn_MemHandle_t mem_handle); + GGMLHEXAGON_LOG_DEBUG("[%s][%s]created", ggml_backend_hexagon_get_devname(device), graph_name.c_str()); - void * alloc_rpcmem(size_t bytes, size_t alignment); - void * get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle); + Qnn_ErrorHandle_t error = QNN_SUCCESS; + if (HEXAGON_BACKEND_QNNNPU == device) { + QnnHtpGraph_CustomConfig_t hvx_config; + hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; + hvx_config.numHvxThreads = hvx_threads; + QnnGraph_Config_t graph_hvx_config; + graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_hvx_config.customConfig = &hvx_config; - void free_rpcmem(void * buf); - void free_rpcmem(); + QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; + if (0 == g_hexagon_appcfg.enable_dlbc) + dlbc_config.optimizationOption.floatValue = 0.0; // set to 0.0 to turn off DLBC + else + dlbc_config.optimizationOption.floatValue = 1.0; // set to 1.0 to turn on DLBC + QnnGraph_Config_t graph_dlbc_config; + graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_dlbc_config.customConfig = &dlbc_config; - bool is_rpcmem_allocated(void * buf); + QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; + opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; + opt_config.optimizationOption.floatValue = 1; // 1 / 3 + QnnGraph_Config_t graph_opt_config; + 
graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_opt_config.customConfig = &opt_config; - bool is_rpcmem_registered(Qnn_MemHandle_t handle) { - return _qnn_mem_set.count(handle) != 0U; - } + QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; + vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; + QnnGraph_Config_t graph_vtcm_config; + graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_vtcm_config.customConfig = &vtcm_config; - bool enable_qnn_rpc() { - return _enable_qnn_rpc; + std::vector graph_configs; + graph_configs.push_back(&graph_hvx_config); + graph_configs.push_back(&graph_dlbc_config); + graph_configs.push_back(&graph_vtcm_config); + graph_configs.push_back(&graph_opt_config); + if (1 == g_hexagon_appcfg.precision_mode) { + QnnHtpGraph_CustomConfig_t fp16_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; + fp16_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; + fp16_config.precision = QNN_PRECISION_FLOAT16; + QnnGraph_Config_t graph_fp16_config; + graph_fp16_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; + graph_fp16_config.customConfig = &fp16_config; + graph_configs.push_back(&graph_fp16_config); + } + graph_configs.push_back(nullptr); + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs.data(), &_qnn_graph_handle); + GGMLHEXAGON_LOG_DEBUG("[%s][%s]created graph %p", ggml_backend_hexagon_get_devname(device), graph_name.c_str(), _qnn_graph_handle); + } else { + error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &_qnn_graph_handle); } - - QNNBackend get_device_id() { - return _device_id; + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", + ggml_backend_hexagon_get_devname(device), graph_name.c_str(), + ggmlqnn_get_qnnerror_string(error)); + return error; } -private: - int load_system(); + GGMLHEXAGON_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_hexagon_get_devname(device), graph_name.c_str()); + if (HEXAGON_BACKEND_QNNNPU == device) { + htp_set_n_hvx_threads(hvx_threads); + } + return QNN_SUCCESS; +} - int unload_system(); +int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, + const QnnGraph_Config_t ** graph_configs) { + Qnn_ErrorHandle_t result = 0; - int load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config); + if (nullptr == graph_name) { + GGMLHEXAGON_LOG_WARN("graph name is null\n"); + return 1; + } - int unload_backend(); + if (!_graph_name.empty()) { + GGMLHEXAGON_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); + return 2; + } - void set_qnn_raw_interface(QNN_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_interface = raw_interface; + if (!do_node_validation) { + GGMLHEXAGON_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); } - void set_qnn_raw_system_interface(QNN_SYSTEM_INTERFACE_VER_TYPE & raw_interface) { - _qnn_raw_system_interface = raw_interface; + _graph_name = graph_name; + _debug_tensor = debug; + _do_node_validations = do_node_validation; + + result = _qnn_raw_interface.graphCreate(_qnn_context_handle, + graph_name, + graph_configs, + &_qnn_graph_handle); + if (QNN_GRAPH_NO_ERROR != result || nullptr == _qnn_graph_handle) { + GGMLHEXAGON_LOG_WARN("failed to create graph in qnn context\n"); + return 3; + } else { + GGMLHEXAGON_LOG_DEBUG("succeed to create graph %s, %p\n", 
graph_name, _qnn_graph_handle); } - void * alloc_rpcmem_internal(size_t bytes, size_t alignment); + return 0; +} - void htp_probe_rpc_meminfo(); +int qnn_instance::finalize_qnn_graph() { + if (nullptr != _qnn_graph_handle) { + if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, + _qnn_profile_handle, nullptr) + != QNN_GRAPH_NO_ERROR) { + GGMLHEXAGON_LOG_WARN("finalizing graph failure\n"); + return 1; + } + } else { + GGMLHEXAGON_LOG_DEBUG("qnn graph handle is null\n"); + } - void print_backend_info(); + return 0; +} - void htp_set_memory_grow_size(size_t size = 1ul * 1024 * 1024); +int qnn_instance::htp_init_perfinfra() { + QnnDevice_Infrastructure_t device_infra = nullptr; + Qnn_ErrorHandle_t error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_WARN("failed to get qnn device infra\n"); + return 1; + } - void htp_enter_performance_mode(); + QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); + QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; + uint32_t power_configid = 1; + uint32_t device_id = 0; + uint32_t core_id = 0; + htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); + _qnn_htp_perfinfra = htp_perfinfra; + _qnn_htp_powerconfig_id = power_configid; + //FIXME:hardcode to 0 and 0 although it's correct + _qnn_htp_device_id = device_id; + _qnn_htp_core_id = core_id; - void htp_set_n_hvx_threads(size_t n_threads); + return 0; +} -private: - static constexpr const int _required_num_providers = 1; +void qnn_instance::htp_probe_rpc_meminfo() { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); + if (nullptr == rpc_buffer) { + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + free_rpcmem(rpc_buffer); + rpc_buffer = nullptr; + } + } + if (candidate_size > _rpcmem_capacity) + _rpcmem_capacity = candidate_size; -private: - std::string _lib_path; - std::string _backend_name; - std::string _model_name; // name of prebuilt QNN model, might be used in the future - BackendIdType _backend_id; + free_rpcmem(); + _rpcmem_usage = 0; + GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); +} + +void qnn_instance::htp_print_info() { + const QnnDevice_PlatformInfo_t * p_info = nullptr; + _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); + GGMLHEXAGON_LOG_DEBUG("HTP device counts %d", p_info->v1.numHwDevices); + QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; + for (size_t i = 0; i < p_info->v1.numHwDevices; i++) { + GGMLHEXAGON_LOG_DEBUG("HTP deviceID:%d, deviceType:%d, numCores %d", infos[i].v1.deviceId, + infos[i].v1.deviceType, infos[i].v1.numCores); + QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; + QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = devinfo->onChipDevice; + QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; + GGMLHEXAGON_LOG_DEBUG("HTP_TYPE:%d(%s)", devinfo->devType, + (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); + GGMLHEXAGON_LOG_DEBUG("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB," \ + "dlbc_support:%d, signedpd_support:%d", \ + chipinfo.socModel, ggmlhexagon_get_socmodel_desc(chipinfo.socModel), \ + htp_arch, ggmlhexagon_get_htparch_desc(htp_arch), chipinfo.vtcmSize, \ + chipinfo.dlbcSupport, chipinfo.signedPdSupport); + struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(chipinfo.socModel); + g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {}}; + if (nullptr != socinfo) { + memcpy(g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo.soc_desc, socinfo->soc_desc, sizeof(socinfo->soc_desc)); + GGMLHEXAGON_LOG_DEBUG("soc info:%s", socinfo->soc_desc); + } else { + memcpy(g_hexagon_mgr[HEXAGON_BACKEND_QNNNPU].socinfo.soc_desc, "unknown", 7); + GGMLHEXAGON_LOG_DEBUG("soc info:unknown"); + } + } + _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); +} - bool _debug_tensor = false; // flag to indicate if requested graph is to be run in debug mode - bool _do_node_validations = true; // flag to indicate whether all add_node calls need to be validated - QnnLog_Level_t _qnn_log_level = QNN_LOG_LEVEL_DEBUG; +void qnn_instance::print_backend_info() { + auto print_property = [&](const char * name, QnnProperty_Key_t property) { + auto ret = _qnn_raw_interface.propertyHasCapability(property); - qnn_profile_level _profile_level = PROFILE_OFF; + const char * status = "Unknown"; + if (ret == QNN_PROPERTY_SUPPORTED) { + status = "Yes"; + } else if (ret == QNN_PROPERTY_NOT_SUPPORTED) { + status = "No"; + } - void * _system_lib_handle = nullptr; - void * _loaded_lib_handle = nullptr; - const QnnInterface_t * _loaded_backend = nullptr; + GGMLHEXAGON_LOG_INFO("%s: %s", name, status); + }; - Qnn_GraphHandle_t _qnn_graph_handle = nullptr; + GGMLHEXAGON_LOG_INFO("QNN backend properties:"); + print_property("Create context from binary list", QNN_PROPERTY_CONTEXT_SUPPORT_CREATE_FROM_BINARY_LIST_ASYNC); + print_property("Dynamic batch", QNN_PROPERTY_GRAPH_SUPPORT_BATCH_MULTIPLE); + print_property("Early termination", QNN_PROPERTY_GRAPH_SUPPORT_EARLY_TERMINATION); + print_property("Dynamic dimensions", QNN_PROPERTY_TENSOR_SUPPORT_DYNAMIC_DIMENSIONS); + print_property("Blockwise quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCK); + print_property("Blockwise quantization with expansion", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION); + print_property("Vector quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_VECTOR); + print_property("Tensor sparsity", QNN_PROPERTY_TENSOR_SUPPORT_SPARSITY); + print_property("Updateable application tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_APP_TENSORS); + print_property("Updateable native tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_NATIVE_TENSORS); + print_property("Updateable static tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_STATIC_TENSORS); + print_property("Qnn group device", QNN_PROPERTY_GROUP_DEVICE); +} - Qnn_LogHandle_t _qnn_log_handle = nullptr; +void qnn_instance::htp_set_memory_grow_size(size_t size) { + QnnHtpPerfInfrastructure_MemoryConfig_t grow_size_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE, + .memGrowSizeConfig = (uint32_t)size, + }; - Qnn_ProfileHandle_t _qnn_profile_handle = nullptr; + const QnnHtpPerfInfrastructure_MemoryConfig_t *memory_config[] = { + &grow_size_config, + nullptr, + }; + Qnn_ErrorHandle_t result = 
_qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); + if (QNN_SUCCESS != result) { + GGMLHEXAGON_LOG_WARN("failed to set HTP memory config"); + } else { + GGMLHEXAGON_LOG_INFO("succeed to set HTP memory config"); + } +} - Qnn_DeviceHandle_t _qnn_device_handle = nullptr; +void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { + QnnHtpGraph_CustomConfig_t htp_hvx_thread_config = { + .option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS, + .numHvxThreads = n_threads, + }; - Qnn_BackendHandle_t _qnn_backend_handle = nullptr; + QnnGraph_Config_t hvx_thread_config = { + .option = QNN_GRAPH_CONFIG_OPTION_CUSTOM, + .customConfig = &htp_hvx_thread_config, + }; - Qnn_ContextHandle_t _qnn_context_handle = nullptr; + const QnnGraph_Config_t * graph_configs[] = {&hvx_thread_config, nullptr}; + Qnn_ErrorHandle_t result = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); + if (QNN_SUCCESS != result) { + GGMLHEXAGON_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); + } else { + GGMLHEXAGON_LOG_INFO("succeed to set QNN graph config: set hvx threads %d", n_threads); + } +} - QnnSystemContext_Handle_t _qnn_system_handle = nullptr; +void qnn_instance::htp_enter_performance_mode() { + QnnHtpPerfInfrastructure_PowerConfig_t dcvs_v3_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, + .dcvsV3Config = + { + .contextId = _qnn_htp_powerconfig_id, - QnnHtpDevice_PerfInfrastructure_t * _qnn_htp_perfinfra = nullptr; - uint32_t _qnn_htp_powerconfig_id = 1; - uint32_t _qnn_htp_device_id = 0; - uint32_t _qnn_htp_core_id = 0; + .setDcvsEnable = 1, + .dcvsEnable = 0, - uint32_t _qnn_rpc_pollingtime = 9999; // 0-10000 us for high performing + .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE, - qnn_interface _qnn_interface; - QNN_INTERFACE_VER_TYPE _qnn_raw_interface; - QNN_SYSTEM_INTERFACE_VER_TYPE _qnn_raw_system_interface; + .setSleepLatency = 1, + .sleepLatency = 40, - std::unordered_map _qnn_mem_set; - std::unordered_map _qnn_rpc_buffer_to_handles; + .setSleepDisable = 1, + .sleepDisable = 1, - std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; - std::unordered_map _rpcmem_store_map; - std::unordered_map _rpcmem_usage_map; - size_t _rpcmem_usage = 0; // mempool usage in Mbytes - size_t _rpcmem_capacity = 512; // mempool size in Mbytes + .setBusParams = 1, + .busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, - std::string _graph_name; - QNNBackend _device_id; - void * _rpc_lib_handle = nullptr; - bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature + .setCoreParams = 1, + .coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + .coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + }, + }; - DISABLE_COPY(qnn_instance); - DISABLE_MOVE(qnn_instance); -}; + QnnHtpPerfInfrastructure_PowerConfig_t hmx_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2, + .hmxV2Config = + { + .hmxPickDefault = 0, + .hmxVoltageCornerMin = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerTarget = DCVS_EXP_VCORNER_MAX, + .hmxVoltageCornerMax = 
DCVS_EXP_VCORNER_MAX, + .hmxPerfMode = QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH, + }, + }; -void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { - if (!_rpcmem_initialized) { - GGMLQNN_LOG_WARN("rpc memory not initialized\n"); - return nullptr; - } + QnnHtpPerfInfrastructure_PowerConfig_t rpc_ctrl_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY, + .rpcControlLatencyConfig = 100, + }; - auto allocate_bytes = static_cast(bytes + alignment); - void * buf = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, allocate_bytes); - if (nullptr == buf) { - GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); - return nullptr; - } + QnnHtpPerfInfrastructure_PowerConfig_t rpc_poll_config = { + .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME, + .rpcPollingTimeConfig = 9999, + }; - auto aligned_buf = reinterpret_cast(ggmlqnn_align_to(alignment, - reinterpret_cast(buf))); - bool status = _rpcmem_store_map.insert(std::pair(aligned_buf, buf)).second; - if (!status) { - GGMLQNN_LOG_WARN("failed to allocate rpc memory\n"); - _pfn_rpc_mem_free(buf); + const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { + &dcvs_v3_config, + &hmx_config, + &rpc_ctrl_config, + &rpc_poll_config, + nullptr, + }; + Qnn_ErrorHandle_t ret = _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); + if (ret != QNN_SUCCESS) { + GGMLHEXAGON_LOG_WARN("failed to set HTP power config"); + } else { + GGMLHEXAGON_LOG_INFO("succeed to set HTP power config"); } - return aligned_buf; } -void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { - if (_rpcmem_usage > (_rpcmem_capacity - 8)) { // reserve 8Mbytes in rpc mempool - GGMLQNN_LOG_WARN("rpc mempool capcaity: %d MB, usage: %d MB", _rpcmem_capacity, _rpcmem_usage); +static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { + if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { + GGMLHEXAGON_LOG_WARN("invalid params\n"); return nullptr; } - auto aligned_buf = alloc_rpcmem_internal(bytes, alignment); - if (nullptr == aligned_buf) + uint8_t * qnn_rpcbuffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(ggml_tensor), 4)); + if (nullptr == qnn_rpcbuffer) { + GGMLHEXAGON_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); return nullptr; - _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes)); - - size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); - rpcmem_usage_in_bytes += bytes; - _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); - return aligned_buf; -} - -void qnn_instance::free_rpcmem(void * buf) { - size_t rpcbuffer_size = 0; - if (!_rpcmem_initialized) { - GGMLQNN_LOG_WARN("rpc memory not initialized\n"); - } else if (0 == _rpcmem_store_map.count(buf)) { - GGMLQNN_LOG_WARN("no allocated tensor\n"); } else { - GGMLQNN_LOG_DEBUG("free rpc mem %p", _rpcmem_store_map[buf]); - for (std::unordered_map::iterator it = _rpcmem_usage_map.begin(); - it != _rpcmem_usage_map.end(); - it++) { - void * rpcbuffer = it->first; - if (buf == rpcbuffer) { - rpcbuffer_size = it->second; - size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); - rpcmem_usage_in_bytes -= rpcbuffer_size; - _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); - } - } - if (rpcbuffer_size != 0) { - _rpcmem_usage_map.erase(buf); - } else { - GGMLQNN_LOG_WARN("it shouldn't happen, pls check why?"); - } - _pfn_rpc_mem_free(_rpcmem_store_map[buf]); - 
_rpcmem_store_map.erase(buf); + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer); } + if (b_copydata) + memcpy(qnn_rpcbuffer, ggml_tensor->data, ggml_nbytes(ggml_tensor)); + instance->register_rpcmem(qnn_rpcbuffer, qnn_tensor); + return qnn_rpcbuffer; } -void qnn_instance::free_rpcmem() { - if (_rpcmem_store_map.empty()) { - GGMLQNN_LOG_WARN("no rpcmem allocated\n"); - return; - } +static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, + Qnn_Param_t * params, uint32_t num_params, + Qnn_Tensor_t * inputs, uint32_t num_inputs, + Qnn_Tensor_t * outputs, uint32_t num_outputs) { - for (std::unordered_map::iterator it = _rpcmem_store_map.begin(); - it != _qnn_mem_set.end(); - it++) { - void * rpcbuffer = it->second; - GGMLQNN_LOG_DEBUG("free rpc buffer %p", rpcbuffer); - _pfn_rpc_mem_free(rpcbuffer); - } - _rpcmem_store_map.clear(); - _rpcmem_usage_map.clear(); - _rpcmem_usage = 0; -} + char opcfg_name[GGML_MAX_NAME] = {}; -int32_t qnn_instance::rpcmem_to_fd(void * buf) { - int32_t mem_fd = -1; - if (!is_rpcmem_initialized()) { - GGMLQNN_LOG_WARN("rpc memory not initialized\n"); + //ensure the opcfg name is unique + if (nullptr == name) { + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_idx(QNN_OPCFG_INDEX)); } else { - mem_fd = _pfn_rpc_mem_to_fd(buf); + snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_idx(QNN_OPCFG_INDEX)); } + GGMLHEXAGON_LOG_DEBUG("create qnn opconfig %s", opcfg_name); + ggmlqnn_inc_idx(QNN_OPCFG_INDEX); - return mem_fd; + Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, + num_params, params, + num_inputs, inputs, + num_outputs, outputs + }; + Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; + + return opcfg; } -int qnn_instance::register_rpcmem(void * p_data, Qnn_Tensor_t * p_tensor) { - if (nullptr == p_data || (nullptr == p_tensor)) { - GGMLQNN_LOG_WARN("invalid param\n"); - return 1; - } +static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, const char * name, + Qnn_TensorType_t qnn_tensor_type, + Qnn_DataType_t qnn_data_type, + uint32_t rank, uint32_t * dims, + void * data, uint32_t data_size, + bool b_transpose = false) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + char tensor_name[GGML_MAX_NAME] = {}; - if (!is_rpcmem_initialized()) { - GGMLQNN_LOG_WARN("rpc memory not initialized\n"); - return 2; + //ensure the tensor name is unique + if (nullptr == name) { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmlqnn_get_idx(QNN_TENSOR_INDEX)); + } else { + snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmlqnn_get_idx(QNN_TENSOR_INDEX)); } + GGMLHEXAGON_LOG_DEBUG("init_tensor %s", tensor_name); + ggmlqnn_inc_idx(QNN_TENSOR_INDEX); - if (is_rpcmem_registered((QNN_VER_PTR(*p_tensor)->memHandle))) { - GGMLQNN_LOG_WARN("tensor %s has been registered shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); - return 3; + uint32_t reverse_dims[GGML_MAX_DIMS] = {}; + uint32_t transpose_dims[GGML_MAX_DIMS] = {}; + uint32_t * tensor_dims = nullptr; + //case 1:use dims info from ggml tensor + if (nullptr != tensor) { + //there are different dimension order between ggml tensor and qnn tensor + for (size_t idx = 0; idx < rank; idx++) { + reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + } + tensor_dims = reverse_dims; } - - int32_t mem_fd = rpcmem_to_fd(p_data); - if (-1 == mem_fd) { - GGMLQNN_LOG_WARN("failed to get file descriptor\n"); - return 4; + //case 2: 
use user's specified tensor_dims + if (nullptr != dims) { + tensor_dims = dims; } - GGMLQNN_LOG_DEBUG("mem_fd %d\n", mem_fd); - Qnn_MemDescriptor_t descriptor = { - {QNN_VER_PTR(*p_tensor)->rank, QNN_VER_PTR(*p_tensor)->dimensions, nullptr}, - QNN_VER_PTR(*p_tensor)->dataType, - QNN_MEM_TYPE_ION, - {{mem_fd}}}; - Qnn_MemHandle_t handle = nullptr; - int error = QNN_SUCCESS; - error = _qnn_interface.qnn_mem_register( - _qnn_context_handle, - &descriptor, - /*numDescriptors=*/1, - &handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s\n", QNN_GET_ERROR_CODE(error), strerror(error)); - return 5; - } else { - GGMLQNN_LOG_INFO("tensor %s successfully register shared memory\n", (QNN_VER_PTR(*p_tensor)->name)); + //case 3: transpose for dst tensor + if (b_transpose) { + GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case + + ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); + tensor_dims = transpose_dims; } - QNN_VER_PTR(*p_tensor)->memHandle = handle; - _qnn_mem_set.insert((std::pair(p_data, handle))); - return 0; -} + Qnn_Tensor_t qnn_tensor = { + .version= QNN_TENSOR_VERSION_1, + {.v1= { + .id = 0, + .name = tensor_name, + .type = qnn_tensor_type, + .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, + .dataType = qnn_data_type, + .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, + .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, + {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .rank = rank, + .dimensions = tensor_dims, + .memType = QNN_TENSORMEMTYPE_RAW, + .clientBuf = {.data = nullptr, .dataSize = 0} + } + } + }; -Qnn_MemHandle_t qnn_instance::register_rpcmem(void * p_data, const uint32_t rank, uint32_t * dimensions, Qnn_DataType_t data_type) { - if (!p_data) { - GGMLQNN_LOG_WARN("invalid param"); + Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); + if (nullptr == p_qnn_tensor) { + GGMLHEXAGON_LOG_WARN("calloc failed"); return nullptr; } - - if (!is_rpcmem_initialized()) { - GGMLQNN_LOG_WARN("rpc memory not initialized"); - return nullptr; + error = ggmlqnn_deep_copy_qnntensor(qnn_tensor, *p_qnn_tensor); + if (error != QNN_SUCCESS) { + free(p_qnn_tensor); + GGMLHEXAGON_LOG_WARN("init tensor failed"); + return nullptr; } - if (is_rpcmem_registered(p_data)) { - GGMLQNN_LOG_WARN("rpc memory already registered"); - return _qnn_rpc_buffer_to_handles[p_data]; + bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == HEXAGON_BACKEND_QNNNPU); + if (enable_npu_rpc) { + QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; + } else { + QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; } + QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); + CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); - int32_t mem_fd = rpcmem_to_fd(p_data); - if (mem_fd == -1) { - GGMLQNN_LOG_WARN("failed to get file descriptor"); - return nullptr; - } + return p_qnn_tensor; +} - GGMLQNN_LOG_DEBUG("mem_fd %d", mem_fd); - Qnn_MemDescriptor_t descriptor = { - {rank, dimensions, nullptr}, - data_type, QNN_MEM_TYPE_ION, - {{mem_fd}} - }; - Qnn_MemHandle_t handle = nullptr; - Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_register(_qnn_context_handle, &descriptor, /*numDescriptors=*/1, &handle); - if (error != QNN_SUCCESS) { - 
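A quick worked example of the dimension reversal performed at the top of ggmlqnn_create_general_tensor() above: ggml stores ne[0] as the innermost (contiguous) dimension, while the QNN tensor descriptor lists the outermost dimension first, so the dims are simply written out in reverse order.

// Sketch: mapping ggml ne[] to QNN dimensions by reversing the order.
// e.g. a rank-4 ggml tensor with ne = {64, 32, 4, 2} becomes QNN dimensions {2, 4, 32, 64}.
uint32_t qnn_dims[GGML_MAX_DIMS] = {};
for (size_t idx = 0; idx < rank; idx++) {
    qnn_dims[idx] = (uint32_t) tensor->ne[rank - 1 - idx];
}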
GGMLQNN_LOG_WARN("failed to register shared memory, error %d, %s", QNN_GET_ERROR_CODE(error), strerror(error)); - return nullptr; - } - - _qnn_rpc_buffer_to_handles.insert({p_data, handle}); - GGMLQNN_LOG_DEBUG("successfully register shared memory handler: %p", handle); - return handle; -} - -void * qnn_instance::get_rpcmem_from_memhandle(Qnn_MemHandle_t mem_handle) { - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - if (it->second == mem_handle) { - return it->first; - } - } - GGMLQNN_LOG_WARN("can't find rpcmem from qnn mem handle %p", mem_handle); - return nullptr; -} - -void qnn_instance::unregister_rpcmem() { +static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, + const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { Qnn_ErrorHandle_t error = QNN_SUCCESS; + uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], + (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; + Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; + Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - if (_qnn_mem_set.empty()) { - GGMLQNN_LOG_WARN("no rpcmem registered\n"); - } - - for (std::unordered_map::iterator it = _qnn_mem_set.begin(); - it != _qnn_mem_set.end(); - it++) { - Qnn_MemHandle_t mem_handle = it->second; - error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d\n", QNN_GET_ERROR_CODE(error)); - } else { - GGMLQNN_LOG_DEBUG("unregister shared memory ok"); + if (0 == tensor->flags) { + qnn_tensor_type = tensor_type; + } else { + if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; + } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { + qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; } } - _qnn_mem_set.clear(); + + qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); + Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, tensor, nullptr, + qnn_tensor_type, qnn_data_type, + ggml_n_dims(tensor), dimensions, + nullptr, 0); + return p_qnn_tensor; } -void qnn_instance::unregister_rpcmem(Qnn_MemHandle_t mem_handle) { - Qnn_ErrorHandle_t error = _qnn_interface.qnn_mem_de_register(&mem_handle, 1); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to unregister shared memory, error %d", QNN_GET_ERROR_CODE(error)); - } +// ================================================================================================= +// section-6: hwaccel approach through QNN: offload GGML op to QNN backend +// ================================================================================================= +/* + * provide a general skeleton to offload ggml op to QNN backend: perform element-wise + * operation on 1/2 input tensors and 1 output tensors +*/ +static void ggmlqnn_compute_elementwise(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; - auto it = std::find_if(_qnn_mem_set.begin(), _qnn_mem_set.end(), - [mem_handle](const auto &kv) { return kv.second == mem_handle; }); - if (it == 
_qnn_mem_set.end()) { - GGMLQNN_LOG_WARN("failed to find shared memory handler: %p", mem_handle); - return; - } + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + size_t qnn_op_index = ggmlhexagon_get_op_index(op); + const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; + size_t input_param_count = ggmlqnn_k_op_caps[qnn_op_index].input_param_count; + std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); + const char * ggml_op_name = ggml_op_name_string.c_str(); - _qnn_mem_set.erase(it); -} + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); -bool qnn_instance::is_rpcmem_allocated(void * buf) { - return _rpcmem_store_map.count(buf) != 0U; -} + hexagon_perf op_perf(graph_name); + op_perf.start(); -int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t ** saver_config) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - GGMLQNN_LOG_DEBUG("lib_path:%s\n", lib_path.c_str()); + bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == HEXAGON_BACKEND_QNNNPU; + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + //retrieve computational resource from cached QNN graph + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & ptensors = std::get<1>(graph_item); + p_tensor0 = ptensors[0]; + if (2 == input_param_count) { + p_tensor1 = ptensors[1]; + p_tensor2 = ptensors[2]; + } else { + //now p_tensor1 is nullptr + p_tensor2 = ptensors[1]; + } + } else { + GGML_ASSERT(instance->get_device_id() == ctx->device); + GGMLHEXAGON_LOG_INFO("graph name %s", graph_name.c_str()); + //create QNN graph + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), + g_hexagon_appcfg.vtcm_size_in_mb, + g_hexagon_appcfg.hvx_threads); + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); + return; + } + graph_handle = instance->get_qnn_graph_handle(); - void * lib_handle = dlopen(lib_path.c_str(), RTLD_NOW | RTLD_GLOBAL); - if (nullptr == lib_handle) { - GGMLQNN_LOG_WARN("can not open QNN library %s, with error: %s", lib_path.c_str(), dlerror()); - return 1; - } + GGMLHEXAGON_LOG_DEBUG("graph_handle %p", graph_handle); + //create computational tensor + p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE); + if (2 == input_param_count) { + p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); + } + p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); - auto get_providers = ggmlqnn_load_qnn_functionpointers<_pfn_QnnInterface_getProviders *>( - lib_handle, - "QnnInterface_getProviders"); - if (nullptr == get_providers) { - GGMLQNN_LOG_WARN("can not load symbol QnnInterface_getProviders : %s", dlerror()); - return 2; - } + //compose QNN graph + qnn_tensors_t input_tensors; + input_tensors.reserve(input_param_count); + input_tensors.push_back(*p_tensor0); + if (2 == input_param_count) { + input_tensors.push_back(*p_tensor1); + } + Qnn_Tensor_t output_tensors[] = { + *p_tensor2 + }; + Qnn_OpConfig_t op_config = ggmlqnn_create_op_config(ggml_op_name, + QNN_OP_PACKAGE_NAME_QTI_AISW, + qnn_op_name, nullptr, 0, + input_tensors.data(), + input_param_count, output_tensors, + 1); + 
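The per-context cache consulted at the top of ggmlqnn_compute_elementwise() keeps one finalized single-node QNN graph per op/shape key. The template arguments of the container types are not preserved in this hunk, so the following is only an assumed sketch of the layout implied by the std::get<0>/std::get<1> accesses and the Qnn_Tensor_t * elements used above:

// Assumed container layout (sketch, not the verbatim typedefs from the patch):
using qnn_ptensors_t       = std::vector<Qnn_Tensor_t *>;                    // [src0, (src1), dst]
using qnn_singlenode_res_t = std::tuple<Qnn_GraphHandle_t, qnn_ptensors_t>;  // graph handle + its tensors
std::map<std::string, qnn_singlenode_res_t> qnn_singlenode_graph_map;        // keyed by graph_name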
CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); + //finalize QNN graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - std::uint32_t num_providers = 0; - const QnnInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to get providers, error %d", QNN_GET_ERROR_CODE(error)); - return 3; - } - GGMLQNN_LOG_DEBUG("num_providers=%d\n", num_providers); - if (num_providers != _required_num_providers) { - GGMLQNN_LOG_WARN("providers is %d instead of required %d", num_providers, _required_num_providers); - return 4; - } + //cache QNN graph + qnn_ptensors_t qnn_elementwise_tensors; + qnn_elementwise_tensors.reserve(input_param_count + 1); - if (nullptr == provider_list) { - GGMLQNN_LOG_WARN("failed to get qnn interface providers\n"); - return 5; - } - bool found_valid_interface = false; - QNN_INTERFACE_VER_TYPE qnn_interface; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_API_VERSION_MAJOR == provider_list[idx]->apiVersion.coreApiVersion.major && - QNN_API_VERSION_MINOR <= provider_list[idx]->apiVersion.coreApiVersion.minor) { - found_valid_interface = true; - qnn_interface = provider_list[idx]->QNN_INTERFACE_VER_NAME; - break; + qnn_elementwise_tensors.push_back(p_tensor0); + if (2 == input_param_count) { + qnn_elementwise_tensors.push_back(p_tensor1); } + qnn_elementwise_tensors.push_back(p_tensor2); + auto graph_item = std::make_tuple(graph_handle, qnn_elementwise_tensors); + ctx->qnn_singlenode_graph_map[graph_name] = graph_item; } - if (!found_valid_interface) { - GGMLQNN_LOG_WARN("unable to find a valid qnn interface\n"); - return 6; - } else { - GGMLQNN_LOG_INFO("find a valid qnn interface\n"); - } - set_qnn_raw_interface(qnn_interface); - - BackendIdType backend_id = provider_list[0]->backendId; - _loaded_backend = provider_list[0]; - _loaded_lib_handle = lib_handle; - _backend_id = backend_id; + if (enable_npu_rpc) { + uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*p_tensor0)->memHandle)); + GGMLHEXAGON_LOG_DEBUG("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); + if (nullptr != qnn_buffer_0) { + memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); + } - auto saver_initialize = - ggmlqnn_load_qnn_functionpointers<_pfn_QnnSaver_initialize *>(_loaded_lib_handle, "QnnSaver_initialize"); - if (nullptr != saver_initialize) { - error = saver_initialize(saver_config); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to saver_initialize,error %d", QNN_GET_ERROR_CODE(error)); - return 7; + if (2 == input_param_count) { + uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle( + QNN_VER_PTR(*p_tensor1)->memHandle)); + GGMLHEXAGON_LOG_DEBUG("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); + if (nullptr != qnn_buffer_1) { + memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); + } } } else { - GGMLQNN_LOG_WARN("saver_initialize is null\n"); + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + if (2 == input_param_count) { + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + } + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; } - return 0; -} - -int qnn_instance::unload_backend() { - int dlclose_error = 0; - dlclose_error = dlclose(_loaded_lib_handle); - if (dlclose_error != 0) { - GGMLQNN_LOG_WARN("failed to close QNN backend %d, error %s\n", _backend_id, 
dlerror()); + qnn_tensors_t input_tensors; + input_tensors.reserve(input_param_count); + input_tensors.push_back(*p_tensor0); + if (2 == input_param_count) { + input_tensors.push_back(*p_tensor1); + } + Qnn_Tensor_t output_tensors[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + input_tensors.data(), input_param_count, + output_tensors, 1, + nullptr, nullptr)); + if (enable_npu_rpc) { + uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); + if (nullptr != qnn_buffer_2) { + memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); + } } - return 0; + op_perf.info(); } -int qnn_instance::load_system() { - Qnn_ErrorHandle_t error = QNN_SUCCESS; +/* + * this function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend + * various UT has verified and succeed but failed in CT of test-backend-ops + * + * the logic of ggmlqnn_compute_mul_mat_4d is similar to ggmlqnn_compute_mul_mat but much more complicated + * than ggmlqnn_compute_mul_mat, so it's a standalone function. + * it will be combined with ggmlqnn_compute_mul_mat in the future + */ +static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + bool graph_initialized = false; + qnn_instance * instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; -#if !defined(__ANDROID__) && !defined(__linux__) - std::string system_lib_path = _lib_path + "QnnSystem.dll"; -#else - std::string system_lib_path = _lib_path + "libQnnSystem.so"; -#endif - GGMLQNN_LOG_DEBUG("system_lib_path:%s\n", system_lib_path.c_str()); + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); - //re-try with default path of QNN binary runtime lib - _lib_path = std::string(g_qnn_params.qnn_runtimelib_path); -#if !defined(__ANDROID__) && !defined(__linux__) - system_lib_path = _lib_path + "QnnSystem.dll"; -#else - system_lib_path = _lib_path + "libQnnSystem.so"; -#endif - _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _system_lib_handle) { - GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); - return 1; - } - } + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); - auto * get_providers = reinterpret_cast<_pfn_QnnSystemInterface_getProviders *>(dlsym( - _system_lib_handle, "QnnSystemInterface_getProviders")); - if (nullptr == get_providers) { - GGMLQNN_LOG_WARN("can not load QNN symbol QnnSystemInterface_getProviders: %s\n", dlerror()); - return 2; - } + hexagon_perf op_perf("ggmlqnn_compute_mul_mat_4d"); + op_perf.start(); - uint32_t num_providers = 0; - const QnnSystemInterface_t ** provider_list = nullptr; - error = get_providers(&provider_list, &num_providers); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to get providers, error %d\n", QNN_GET_ERROR_CODE(error)); - return 3; - } + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); + GGMLHEXAGON_LOG_DEBUG("graph name %s\n", graph_name.c_str()); - if (num_providers != _required_num_providers) { - GGMLQNN_LOG_WARN("providers is %d instead of 
required %d\n", num_providers, _required_num_providers); - return 4; - } + ggmlhexagon_print_tensors_info(__func__, ctx, src0, src1, dst); - if (nullptr == provider_list) { - GGMLQNN_LOG_WARN("can not get providers\n"); - return 5; - } + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_reshape0_out = nullptr; + Qnn_Tensor_t * p_tile0_out = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_permute1_out = nullptr; + Qnn_Tensor_t * p_reshape1_out = nullptr; + Qnn_Tensor_t * p_matmul_out = nullptr; + Qnn_Tensor_t * p_reshape2_out = nullptr; - QNN_SYSTEM_INTERFACE_VER_TYPE qnn_system_interface; - bool found_valid_system_interface = false; - for (size_t idx = 0; idx < num_providers; idx++) { - if (QNN_SYSTEM_API_VERSION_MAJOR == - provider_list[idx]->systemApiVersion.major && - QNN_SYSTEM_API_VERSION_MINOR <= - provider_list[idx]->systemApiVersion.minor) { - found_valid_system_interface = true; - qnn_system_interface = provider_list[idx]->QNN_SYSTEM_INTERFACE_VER_NAME; - break; - } - } - if (!found_valid_system_interface) { - GGMLQNN_LOG_WARN("unable to find a valid qnn system interface\n"); - return 6; + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + graph_initialized = true; + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t & tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_reshape0_out = tensors[1]; + p_tile0_out = tensors[2]; + p_tensor1 = tensors[3]; + p_permute1_out = tensors[4]; + p_reshape1_out = tensors[5]; + p_matmul_out = tensors[6]; + p_reshape2_out = tensors[7]; } else { - GGMLQNN_LOG_INFO("find a valid qnn system interface\n"); - } - set_qnn_raw_system_interface(qnn_system_interface); + CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); - _qnn_interface.set_qnn_system_interface(provider_list[0]); + // Define dimensions + uint32_t K = src0->ne[0]; // Inner dimension + uint32_t M = src0->ne[1]; // Rows of src0 + uint32_t N = src1->ne[1]; // Columns of src1 + uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch + uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) - _qnn_interface.qnn_system_context_create(&_qnn_system_handle); - if (nullptr == _qnn_system_handle) { - GGMLQNN_LOG_WARN("can not create QNN system contenxt\n"); - } else { - GGMLQNN_LOG_INFO("initialize qnn system successfully\n"); - } + // Validate K only + GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match - return 0; -} + // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] + uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), + static_cast(src0->ne[1]), static_cast(src0->ne[0]) + }; + p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, "input0", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src0_dims, nullptr, 0); -int qnn_instance::unload_system() { - int result = 0; + // Reshape src0 to [B0, M, K] + uint32_t reshape0_out_dims[] = {B0, M, K}; + p_reshape0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape0_out_dims, nullptr, 0); - if (nullptr == _system_lib_handle) { - GGMLQNN_LOG_DEBUG("system lib handle is null\n"); - return 1; - } + Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; + Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; + Qnn_OpConfig_t 
reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape0_inputs, 1, reshape0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); - if (nullptr != _qnn_system_handle) { - result = _qnn_interface.qnn_system_context_free(_qnn_system_handle); - if (result != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN system context\n"); - } - _qnn_system_handle = nullptr; - } + // Tile src0 to match B1: [B0, M, K] -> [B1, M, K] + uint32_t tile0_out_dims[] = {B1, M, K}; + p_tile0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile0_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + tile0_out_dims, nullptr, 0); - int dlclose_error = dlclose(_system_lib_handle); - if (dlclose_error != 0) { - GGMLQNN_LOG_WARN("failed to close QnnSystem library, error %s\n", dlerror()); - return 2; - } + uint32_t tile_multiples[] = {B1 / B0, 1, 1}; + uint32_t tile_dims[] = {3}; + Qnn_Tensor_t * p_tile_multiples = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile_multiples", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + tile_dims, tile_multiples, sizeof(tile_multiples)); - _system_lib_handle = nullptr; + Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; + Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; + Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; + Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TILE, tile_params, 1, + tile0_inputs, 1, tile0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); - return result; -} + // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] + uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[1]), static_cast(src1->ne[0]) + }; + p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, "input1", + QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, + src1_dims, nullptr, 0); -static void ggmlqnn_sdk_logcallback(const char * fmt, - QnnLog_Level_t level, - uint64_t timestamp, - va_list argp) { - if (0 == g_qnn_params.print_qnn_internal_log) - return; + // Permute src1 to [B1, H1, K, N] + uint32_t perm_data[] = {0, 1, 3, 2}; + uint32_t perm_dims[] = {4}; + Qnn_Tensor_t * p_perm = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "perm", + QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, + perm_dims, perm_data, sizeof(perm_data)); - static std::mutex log_mutex; - static unsigned char s_ggmlqnn_sdk_logbuf[GGML_QNN_LOGBUF_LEN]; + uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), + static_cast(src1->ne[0]), static_cast(src1->ne[1]) + }; + p_permute1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "permute1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, + permute1_out_dims, nullptr, 0); - const char * log_level_desc = ""; - switch (level) { - case QNN_LOG_LEVEL_ERROR: - log_level_desc = " ERROR "; - break; - case QNN_LOG_LEVEL_WARN: - log_level_desc = "WARNING"; - break; - case QNN_LOG_LEVEL_INFO: - log_level_desc = " INFO "; - break; - case QNN_LOG_LEVEL_DEBUG: - log_level_desc = " DEBUG "; - break; - case QNN_LOG_LEVEL_VERBOSE: - log_level_desc = "VERBOSE"; - break; - case QNN_LOG_LEVEL_MAX: - log_level_desc = "UNKNOWN"; - break; - } + Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; + Qnn_Tensor_t permute1_inputs[] = 
{*p_tensor1}; + Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; + Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, permute1_params, 1, + permute1_inputs, 1, permute1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); - double ms = (double) timestamp / 1000000.0; - { - std::lock_guard lock(log_mutex); - memset(s_ggmlqnn_sdk_logbuf, 0, GGML_QNN_LOGBUF_LEN); - vsnprintf(reinterpret_cast(s_ggmlqnn_sdk_logbuf), GGML_QNN_LOGBUF_LEN, fmt, argp); - GGMLQNN_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_sdk_logbuf); - } -} + // Reshape src1 to [B1, K, N] + uint32_t reshape1_out_dims[] = {B1, K, N}; + p_reshape1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape1_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + reshape1_out_dims, nullptr, 0); -int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; - GGMLQNN_LOG_DEBUG("enter qni_init\n"); + Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; + Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; + Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape1_inputs, 1, reshape1_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); - _device_id = QNN_BACKEND_GGML; - if (_backend_name.find("QnnCpu") != std::string::npos) { - _device_id = QNN_BACKEND_CPU; - } - if (_backend_name.find("QnnGpu") != std::string::npos) { - _device_id = QNN_BACKEND_GPU; - } - if (_backend_name.find("QnnHtp") != std::string::npos) { - _device_id = QNN_BACKEND_NPU; - } - if (QNN_BACKEND_GGML == _device_id) { - GGMLQNN_LOG_INFO("user specified qnn backend is ggml, skip QNN initialize"); - return 0; - } + // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] + uint32_t matmul_out_dims[] = {B1, M, N}; + p_matmul_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "matmul_out", + QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, + matmul_out_dims, nullptr, 0); - if (0 != load_system()) { - GGMLQNN_LOG_WARN("can not load QNN system lib, pls check why?\n"); - return 1; - } else { - GGMLQNN_LOG_DEBUG("load QNN system lib successfully\n"); - } + Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; + Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; + Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, nullptr, 0, + matmul_inputs, 2, matmul_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - std::string backend_lib_path = _lib_path + _backend_name; + // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] + uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), static_cast(dst->ne[2]), + static_cast(dst->ne[1]), static_cast(dst->ne[0]) + }; + p_reshape2_out = ggmlqnn_create_general_tensor(instance, graph_handle, dst, "output", + QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, + reshape2_out_dims, nullptr, 0); - int is_load_ok = load_backend(backend_lib_path, saver_config); - if (0 != is_load_ok) { - GGMLQNN_LOG_WARN("failed to load QNN backend\n"); - return 2; - } + Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; + Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; + Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_RESHAPE, nullptr, 0, + reshape2_inputs, 1, 
reshape2_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); - _qnn_interface.set_qnn_interface(_loaded_backend); -#if 1 - _qnn_interface.qnn_log_create(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); -#else - _qnn_raw_interface.logCreate(ggmlqnn_sdk_logcallback, _qnn_log_level, &_qnn_log_handle); -#endif - if (nullptr == _qnn_log_handle) { - GGMLQNN_LOG_WARN("why failed to initialize qnn log\n"); //NPU backend not work on Qualcomm SoC based low-end phone - return 3; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn log successfully\n"); - } + // Finalize + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); - std::vector temp_backend_config; - _qnn_interface.qnn_backend_create(_qnn_log_handle, - temp_backend_config.empty() ? nullptr : temp_backend_config.data(), - &_qnn_backend_handle); - if (nullptr == _qnn_backend_handle) { - GGMLQNN_LOG_WARN("why failed to initialize qnn backend\n"); - return 4; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn backend successfully\n"); + // Cache + qnn_ptensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, + p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out + }; + ctx->qnn_singlenode_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); } - if (nullptr != _qnn_raw_interface.propertyHasCapability) { - auto qnnstatus = _qnn_raw_interface.propertyHasCapability(QNN_PROPERTY_GROUP_DEVICE); - if (QNN_PROPERTY_NOT_SUPPORTED == qnnstatus) { - GGMLQNN_LOG_WARN("device property is not supported\n"); - } - if (QNN_PROPERTY_ERROR_UNKNOWN_KEY == qnnstatus) { - GGMLQNN_LOG_WARN("device property is not known to backend\n"); - } - } + // Execute + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; + QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; - Qnn_ErrorHandle_t qnnstatus = QNN_SUCCESS; - if (_device_id == QNN_BACKEND_NPU) { - const QnnDevice_PlatformInfo_t * p_info = nullptr; - qcom_socinfo soc_info = {}; - qnnstatus = _qnn_raw_interface.deviceGetPlatformInfo(nullptr, &p_info); - if (QNN_SUCCESS == qnnstatus) { - GGMLQNN_LOG_INFO("device counts %d\n", p_info->v1.numHwDevices); - QnnDevice_HardwareDeviceInfo_t * infos = p_info->v1.hwDevices; - QnnHtpDevice_OnChipDeviceInfoExtension_t chipinfo = {}; - for (uint32_t i = 0; i < p_info->v1.numHwDevices; i++) { - GGMLQNN_LOG_INFO("deviceID:%d, deviceType:%d, numCores %d\n", (int) infos[i].v1.deviceId, - (int) infos[i].v1.deviceType, (int) infos[i].v1.numCores); - QnnDevice_DeviceInfoExtension_t devinfo = infos[i].v1.deviceInfoExtension; - chipinfo = devinfo->onChipDevice; - size_t htp_arch = (size_t) chipinfo.arch; - GGMLQNN_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, - (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? 
"ON_CHIP" : ""); - soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; - } - _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); - } else { - GGMLQNN_LOG_WARN("failed to get platform info, are we in emulator?\n"); - soc_info = { NONE, UNKNOWN_SM, 0 }; - } + Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, 1, NULL, NULL)); - QnnHtpDevice_CustomConfig_t soc_customconfig; - soc_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_SOC; - soc_customconfig.socModel = soc_info.soc_model; - QnnDevice_Config_t soc_devconfig; - soc_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - soc_devconfig.customConfig = &soc_customconfig; + op_perf.info(); +} - /* - QnnHtpDevice_CustomConfig_t arch_customconfig; - arch_customconfig.option = QNN_HTP_DEVICE_CONFIG_OPTION_ARCH; - arch_customconfig.arch.arch = (QnnHtpDevice_Arch_t)soc_info.htp_arch; - arch_customconfig.arch.deviceId = 0; - QnnDevice_Config_t arch_devconfig; - arch_devconfig.option = QNN_DEVICE_CONFIG_OPTION_CUSTOM; - arch_devconfig.customConfig = &arch_customconfig; - */ - const QnnDevice_Config_t * p_deviceconfig[] = { &soc_devconfig, nullptr }; - qnnstatus = _qnn_raw_interface.deviceCreate(_qnn_log_handle, p_deviceconfig, &_qnn_device_handle); - } else { - qnnstatus = _qnn_interface.qnn_device_create(_qnn_log_handle, nullptr, &_qnn_device_handle); - } - if (QNN_SUCCESS != qnnstatus && QNN_DEVICE_ERROR_UNSUPPORTED_FEATURE != qnnstatus) { - GGMLQNN_LOG_WARN("failed to create QNN device\n"); - } else { - GGMLQNN_LOG_INFO("create device successfully\n"); - } +/* + * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs + * using the QNN backend. this function performs matrix multiplication of the input tensor + * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, + * and stores the result in the destination tensor `dst`. + * + there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn + 1. transpose + a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: + struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); + which like this: + +---+---+ + | 0 | 1 | + +---+---+ + | 2 | 3 | + +---+---+ + | 4 | 5 | + +---+---+ + with + ne[0] = 2 + ne[1] = 3 + there are different dimension order between ggml tensor and qnn tensor - if (PROFILE_OFF != _profile_level) { - GGMLQNN_LOG_INFO("profiling turned on; level = %d", _profile_level); - if (PROFILE_BASIC == _profile_level) { - GGMLQNN_LOG_INFO("basic profiling requested. creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_BASIC, &_qnn_profile_handle)) { - GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 5; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } else if (PROFILE_DETAIL == _profile_level) { - GGMLQNN_LOG_INFO("detailed profiling requested. Creating Qnn Profile object\n"); - if (QNN_PROFILE_NO_ERROR != _qnn_raw_interface.profileCreate( - _qnn_backend_handle, QNN_PROFILE_LEVEL_DETAILED, &_qnn_profile_handle)) { - GGMLQNN_LOG_WARN("unable to create profile handle in the backend\n"); - return 6; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn profile successfully\n"); - } - } - } + 2. 
QNN's MatMul can only support input tensors with rank >= 2 -#if defined(__ANDROID__) || defined(__linux__) - std::filesystem::path full_path(std::string(g_qnn_params.qnn_runtimelib_path) + "libcdsprpc.so"); - full_path /= std::filesystem::path("libcdsprpc.so").filename(); - _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _rpc_lib_handle) { - GGMLQNN_LOG_WARN("failed to load %s\n", full_path.c_str()); - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - } -#else - _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); -#endif - if (nullptr == _rpc_lib_handle) { - GGMLQNN_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 7; - } else { - GGMLQNN_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { - GGMLQNN_LOG_WARN("unable to access symbols in QNN RPC lib, dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 8; - } + in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose + operation when offloading mulmat to QNN backend. this implementation will handle transpose + in func ggmlqnn_compute_create_general_tensor() - if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy - _pfn_rpc_mem_init(); + * @param ctx the context of backend + * @param op the destination tensor where the result of the matrix multiplication will be stored. + * + * @note the logic of ggmlqnn_compute_mul_mat is similar to ggmlqnn_compute_op_two_tensors but much more complicated + * than ggmlqnn_compute_op_two_tensors. so it's a standalone function. accordingly, this is another + * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute + * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds + * of MUL_MAT to compute: + * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend + * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 + * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) + * and src1 is F32, src0 -> f32 in src0', then src0' * src1 +*/ +static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { + Qnn_ErrorHandle_t error = QNN_SUCCESS; + qnn_instance * instance = nullptr; + Qnn_GraphHandle_t graph_handle = nullptr; + Qnn_Tensor_t * p_tensor0 = nullptr; + Qnn_Tensor_t * p_tensor1 = nullptr; + Qnn_Tensor_t * p_tensor2 = nullptr; + Qnn_Tensor_t * p_param_tensor = nullptr; + Qnn_Tensor_t * p_tensor2_transpose = nullptr; + const ggml_tensor * src0 = op->src[0]; + const ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; - std::vector temp_context_config; - _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, - temp_context_config.empty() ? 
nullptr : temp_context_config.data(), - &_qnn_context_handle); - if (nullptr == _qnn_context_handle) { - GGMLQNN_LOG_WARN("why failed to initialize qnn context, error:%s\n", strerror(errno)); - return 9; - } else { - GGMLQNN_LOG_DEBUG("initialize qnn context successfully\n"); - } + GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); + instance = ctx->instance; + QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - if (_backend_name.find("Htp") != std::string::npos) { - htp_probe_rpc_meminfo(); + const enum ggml_type src0_type = src0->type; + const uint32_t src0_rank = ggml_n_dims(src0); + const uint32_t src1_rank = ggml_n_dims(src1); - if (0 != htp_init_perfinfra()) { - GGMLQNN_LOG_WARN("initialize HTP performance failure"); - } + ggmlhexagon_print_tensors_info(__func__, ctx, src0, src1, dst); - htp_enter_performance_mode(); - htp_set_memory_grow_size(); + std::string graph_name; + ggmlhexagon_get_opkey_from_op(op, graph_name); - if (enable_qnn_rpc()) { - GGMLQNN_LOG_INFO("NPU RPC feature enabled with QNN-NPU backend"); - } else { - GGMLQNN_LOG_INFO("NPU RPC feature disabled with QNN-NPU backend"); - } + hexagon_perf op_perf(graph_name); + op_perf.start(); + + GGML_ASSERT(src0_rank == src1_rank); + GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy + if (4 == src0_rank) { + return ggmlqnn_compute_mul_mat_4d(ctx, op); } - print_backend_info(); + void * wdata = ggmlhexagon_type_trait(ctx, op); + const size_t desired_size = ctx->desired_size; - GGMLQNN_LOG_DEBUG("leave qni_init\n"); + if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { + //retrieve computational resource from cached QNN graph + qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; + graph_handle = std::get<0>(graph_item); + qnn_ptensors_t &tensors = std::get<1>(graph_item); + p_tensor0 = tensors[0]; + p_tensor1 = tensors[1]; + p_tensor2 = tensors[2]; + p_param_tensor = tensors[3]; + p_tensor2_transpose = tensors[4]; + } else { + //create QNN graph + GGMLHEXAGON_LOG_INFO("graph name %s", graph_name.c_str()); + error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), + g_hexagon_appcfg.vtcm_size_in_mb, + g_hexagon_appcfg.hvx_threads); + if (QNN_SUCCESS != error) { + GGMLHEXAGON_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", + graph_name.c_str(), error); + return; + } + graph_handle = instance->get_qnn_graph_handle(); - return 0; -} + //create computational tensor + p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, nullptr, + QNN_TENSOR_TYPE_APP_WRITE, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); + p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, nullptr, + QNN_TENSOR_TYPE_APP_WRITE, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); + p_tensor2 = ggmlqnn_create_general_tensor(instance, graph_handle, dst, nullptr, + QNN_TENSOR_TYPE_APP_READ, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0); -int qnn_instance::qnn_finalize() { - int ret_status = 0; - Qnn_ErrorHandle_t error = QNN_SUCCESS; + //create param tensor for offload 2d/3d/4d matrix multiplication + const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { + {0}, + {1, 0}, + {0, 2, 1}, + {0, 1, 3, 2}, + }; + uint32_t param_tensor_dims[1] = {src0_rank}; + p_param_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "param", + QNN_TENSOR_TYPE_STATIC, + QNN_DATATYPE_UINT_32, 1, + param_tensor_dims, + (void *) (param_tensor_data[src0_rank - 
1]), + src0_rank * sizeof(uint32_t)); - GGMLQNN_LOG_INFO("enter %s\n", __func__); - ggmlqnn_reset_idx(); + //create transpose tensor + p_tensor2_transpose = ggmlqnn_create_general_tensor(instance, graph_handle, dst, + "transpose", + QNN_TENSOR_TYPE_NATIVE, + QNN_DATATYPE_FLOAT_32, src0_rank, + nullptr, nullptr, 0, true); - free_rpcmem(); - unregister_rpcmem(); + //compose QNN graph: add mulmat node + Qnn_Param_t out_0_params[] = { + {QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = { + QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; + Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; + Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; + Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_MAT_MUL, out_0_params, 1, + out_0_inputs, 2, out_0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_0)); - if (nullptr != _pfn_rpc_mem_deinit) - _pfn_rpc_mem_deinit(); + //compose QNN graph: add transpose node + Qnn_Param_t out_trans1_0_params[] = { + {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; + Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; + Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; + Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", + QNN_OP_PACKAGE_NAME_QTI_AISW, + QNN_OP_TRANSPOSE, + out_trans1_0_params, 1, + out_trans1_0_inputs, 1, + out_trans1_0_outputs, 1); + CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_trans1_0)); - if (0 != dlclose(_rpc_lib_handle)) { - GGMLQNN_LOG_WARN("failed to unload qualcomm's rpc lib, error:%s\n", dlerror()); - } else { - GGMLQNN_LOG_DEBUG("succeed to close rpcmem lib\n"); - } - - if (nullptr != _qnn_context_handle) { - error = _qnn_interface.qnn_context_free(_qnn_context_handle, _qnn_profile_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN context_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); + //finalize QNN graph + CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - } - _qnn_context_handle = nullptr; + //cache QNN graph + qnn_ptensors_t ggml_op_mulmat_tensors; + ggml_op_mulmat_tensors.reserve(5); + ggml_op_mulmat_tensors.push_back(p_tensor0); + ggml_op_mulmat_tensors.push_back(p_tensor1); + ggml_op_mulmat_tensors.push_back(p_tensor2); + ggml_op_mulmat_tensors.push_back(p_param_tensor); + ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); + auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); + ctx->qnn_singlenode_graph_map[graph_name] = graph_item; } - if (nullptr != _qnn_profile_handle) { - error = _qnn_interface.qnn_profile_free(_qnn_profile_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN profile_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - - } - _qnn_profile_handle = nullptr; + if (src0_type != GGML_TYPE_F32) { + QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + } else { + QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; } + QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; + QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - if (nullptr != _qnn_device_handle) { - error = _qnn_interface.qnn_device_free(_qnn_device_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN device_handle: 
ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - - } - _qnn_device_handle = nullptr; - } + Qnn_Tensor_t tensor_inputs[] = { + *p_tensor0, + *p_tensor1 + }; + Qnn_Tensor_t tensor_outputs[] = { + *p_tensor2 + }; + CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, + tensor_inputs, 2, + tensor_outputs, 1, + nullptr, nullptr)); + op_perf.info(); +} - if (nullptr != _qnn_backend_handle) { - error = _qnn_interface.qnn_backend_free(_qnn_backend_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN backend_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - } - _qnn_backend_handle = nullptr; +static void ggmlqnn_compute_repeat(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - } +static void ggmlqnn_compute_div(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - if (nullptr != _qnn_log_handle) { - error = _qnn_interface.qnn_log_free(_qnn_log_handle); - if (error != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to free QNN log_handle: ID %u, error %d\n", - _qnn_interface.get_backend_id(), QNN_GET_ERROR_CODE(error)); - } - _qnn_log_handle = nullptr; - } +static void ggmlqnn_compute_leaky_relu(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - unload_backend(); - unload_system(); +static void ggmlqnn_compute_concat(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - GGMLQNN_LOG_INFO("leave %s\n", __func__); - return ret_status; +static void ggmlqnn_compute_arange(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } -int qnn_instance::init_qnn_graph(const std::string & graph_name, QNNBackend device, size_t vtcm_size_in_mb, size_t hvx_threads) { - _graph_name = graph_name; - _device_id = device; +static void ggmlqnn_compute_sqr(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - GGMLQNN_LOG_DEBUG("[%s][%s]created", ggml_backend_qnn_get_devname(device), graph_name.c_str()); +static void ggmlqnn_compute_clamp(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - Qnn_ErrorHandle_t error = QNN_SUCCESS; - if (QNN_BACKEND_NPU == device) { - QnnHtpGraph_CustomConfig_t hvx_config; - hvx_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS; - hvx_config.numHvxThreads = hvx_threads; - QnnGraph_Config_t graph_hvx_config; - graph_hvx_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_hvx_config.customConfig = &hvx_config; +static void ggmlqnn_compute_scale(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - QnnHtpGraph_CustomConfig_t dlbc_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; - dlbc_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - dlbc_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_ENABLE_DLBC; - if (0 == g_qnn_params.enable_dlbc) - dlbc_config.optimizationOption.floatValue = 0.0; // set to 0.0 to turn off DLBC - else - dlbc_config.optimizationOption.floatValue = 1.0; // set to 1.0 to turn on DLBC - QnnGraph_Config_t graph_dlbc_config; - graph_dlbc_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_dlbc_config.customConfig = &dlbc_config; +static void ggmlqnn_compute_argsort(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + 
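+    // note: this and the neighbouring ggmlqnn_compute_* stubs are intentional no-ops; their
+    // parameters are consumed via GGML_UNUSED and the corresponding ggml ops are simply not
+    // offloaded to QNN in this patch (the stubs presumably reserve entry points for later work)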
GGML_UNUSED(dst); +} - QnnHtpGraph_CustomConfig_t opt_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; - opt_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_OPTIMIZATION; - opt_config.optimizationOption.type = QNN_HTP_GRAPH_OPTIMIZATION_TYPE_FINALIZE_OPTIMIZATION_FLAG; - opt_config.optimizationOption.floatValue = 1; // 1 / 3 - QnnGraph_Config_t graph_opt_config; - graph_opt_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_opt_config.customConfig = &opt_config; +static void ggmlqnn_compute_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - QnnHtpGraph_CustomConfig_t vtcm_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; - vtcm_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_VTCM_SIZE; - vtcm_config.vtcmSizeInMB = vtcm_size_in_mb; - QnnGraph_Config_t graph_vtcm_config; - graph_vtcm_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_vtcm_config.customConfig = &vtcm_config; +static void ggmlqnn_compute_group_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - std::vector graph_configs; - graph_configs.push_back(&graph_hvx_config); - graph_configs.push_back(&graph_dlbc_config); - graph_configs.push_back(&graph_vtcm_config); - graph_configs.push_back(&graph_opt_config); - if (1 == g_qnn_params.precision_mode) { - QnnHtpGraph_CustomConfig_t fp16_config = QNN_HTP_GRAPH_CUSTOM_CONFIG_INIT; - fp16_config.option = QNN_HTP_GRAPH_CONFIG_OPTION_PRECISION; - fp16_config.precision = QNN_PRECISION_FLOAT16; - QnnGraph_Config_t graph_fp16_config; - graph_fp16_config.option = QNN_GRAPH_CONFIG_OPTION_CUSTOM; - graph_fp16_config.customConfig = &fp16_config; - graph_configs.push_back(&graph_fp16_config); - } - graph_configs.push_back(nullptr); - error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs.data(), &_qnn_graph_handle); - GGMLQNN_LOG_DEBUG("[%s][%s]created graph %p", ggml_backend_qnn_get_devname(device), graph_name.c_str(), _qnn_graph_handle); - } else { - error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &_qnn_graph_handle); - } - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_ERROR("[%s][%s]failed to create qnn graph, error: %s", - ggml_backend_qnn_get_devname(device), graph_name.c_str(), - ggmlqnn_get_qnnerror_string(error)); - return error; - } +static void ggmlqnn_compute_acc(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - GGMLQNN_LOG_DEBUG("[%s]create graph %s succeed", ggml_backend_qnn_get_devname(device), graph_name.c_str()); - if (QNN_BACKEND_NPU == device) { - htp_set_n_hvx_threads(hvx_threads); - } - return QNN_SUCCESS; +static void ggmlqnn_compute_sum_rows(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } -int qnn_instance::init_qnn_graph(const char * graph_name, bool debug, uint8_t do_node_validation, - const QnnGraph_Config_t ** graph_configs) { - Qnn_ErrorHandle_t result = 0; +static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - if (nullptr == graph_name) { - GGMLQNN_LOG_WARN("graph name is null\n"); - return 1; - } +static void ggmlqnn_compute_pad(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - if (!_graph_name.empty()) { - GGMLQNN_LOG_WARN("qnn model for graph %s already initialized\n", graph_name); - return 2; - } +static void 
ggmlqnn_compute_pool2d(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - if (!do_node_validation) { - GGMLQNN_LOG_WARN("node validation disabled, backend will not perform op validation prior to adding node\n"); - } +static void ggmlqnn_compute_dup(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - _graph_name = graph_name; - _debug_tensor = debug; - _do_node_validations = do_node_validation; +static void ggmlqnn_compute_rms_norm(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - result = _qnn_raw_interface.graphCreate(_qnn_context_handle, - graph_name, - graph_configs, - &_qnn_graph_handle); - if (QNN_GRAPH_NO_ERROR != result || nullptr == _qnn_graph_handle) { - GGMLQNN_LOG_WARN("failed to create graph in qnn context\n"); - return 3; - } else { - GGMLQNN_LOG_DEBUG("succeed to create graph %s, %p\n", graph_name, _qnn_graph_handle); - } +static void ggmlqnn_compute_diag_mask(ggml_backend_hexagon_context * ctx, ggml_tensor * dst, float value) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); + GGML_UNUSED(value); +} - return 0; +static void ggmlqnn_compute_im2col(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } -int qnn_instance::finalize_qnn_graph() { - if (nullptr != _qnn_graph_handle) { - if (_qnn_raw_interface.graphFinalize(_qnn_graph_handle, - _qnn_profile_handle, nullptr) - != QNN_GRAPH_NO_ERROR) { - GGMLQNN_LOG_WARN("finalizing graph failure\n"); - return 1; - } - } else { - GGMLQNN_LOG_DEBUG("qnn graph handle is null\n"); - } +static void ggmlqnn_compute_timestep_embedding(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - return 0; +static void ggmlqnn_compute_cpy(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + ggmlqnn_compute_dup(ctx, dst); } -int qnn_instance::htp_init_perfinfra() { - QnnDevice_Infrastructure_t device_infra = nullptr; - Qnn_ErrorHandle_t error = _qnn_raw_interface.deviceGetInfrastructure(&device_infra); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_WARN("failed to get qnn device infra\n"); - return 1; - } +static void ggmlqnn_compute_softmax(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - QnnHtpDevice_Infrastructure_t * htp_infra = static_cast(device_infra); - QnnHtpDevice_PerfInfrastructure_t * htp_perfinfra = &htp_infra->perfInfra; - uint32_t power_configid = 1; - uint32_t device_id = 0; - uint32_t core_id = 0; - htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); - _qnn_htp_perfinfra = htp_perfinfra; - _qnn_htp_powerconfig_id = power_configid; - //FIXME:hardcode to 0 and 0 although it's correct - _qnn_htp_device_id = device_id; - _qnn_htp_core_id = core_id; +static void ggmlqnn_compute_get_rows(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); +} - return 0; +static void ggmlqnn_compute_rope(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(dst); } -void qnn_instance::htp_probe_rpc_meminfo() { - size_t candidate_size = 0; - uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; - size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); - for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); - if 
(nullptr == rpc_buffer) { - GGMLQNN_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); - break; - } else { - candidate_size = probe_slots[idx]; - free_rpcmem(rpc_buffer); - rpc_buffer = nullptr; - } +// ================================================================================================= +// section-7: cDSP helper function +// ================================================================================================= +static const char * ggmlhexagon_get_dsp_name(int domain_id) { + switch (domain_id) { + case HEXAGON_ADSP: + return "Hexagon-aDSP"; + case HEXAGON_MDSP: + return "Hexagon-mDSP"; + case HEXAGON_SDSP: + return "Hexagon-sDSP"; + case HEXAGON_CDSP: + return "Hexagon-cDSP"; + case HEXAGON_CDSP1: + return "Hexagon-cDSP1"; + default: + return "Hexagon-unknown"; } - if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; +} - free_rpcmem(); - _rpcmem_usage = 0; - GGMLQNN_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); +static int ggmlhexagon_pd_status_notifier_callback(void * context, int domain, int session, remote_rpc_status_flags_t status){ + int error = AEE_SUCCESS; + switch (status){ + case FASTRPC_USER_PD_UP: + GGMLHEXAGON_LOG_DEBUG("PD is up\n"); + break; + case FASTRPC_USER_PD_EXIT: + GGMLHEXAGON_LOG_DEBUG("PD closed\n"); + break; + case FASTRPC_USER_PD_FORCE_KILL: + GGMLHEXAGON_LOG_DEBUG("PD force kill\n"); + break; + case FASTRPC_USER_PD_EXCEPTION: + GGMLHEXAGON_LOG_DEBUG("PD exception\n"); + break; + case FASTRPC_DSP_SSR: + GGMLHEXAGON_LOG_DEBUG("DSP SSR\n"); + break; + default : + error = AEE_EBADITEM; + break; + } + return error; } -void qnn_instance::print_backend_info() { - auto print_property = [&](const char * name, QnnProperty_Key_t property) { - auto ret = _qnn_raw_interface.propertyHasCapability(property); +static domain * ggmlhexagon_get_domain(int domain_id) { + int size = sizeof(hexagon_supported_domains) / sizeof(domain); - const char * status = "Unknown"; - if (ret == QNN_PROPERTY_SUPPORTED) { - status = "Yes"; - } else if (ret == QNN_PROPERTY_NOT_SUPPORTED) { - status = "No"; - } + for (size_t i = 0; i < size; i++) { + if (hexagon_supported_domains[i].id == domain_id) + return &hexagon_supported_domains[i]; + } - GGMLQNN_LOG_INFO("%s: %s", name, status); - }; + return nullptr; +} - GGMLQNN_LOG_INFO("QNN backend properties:"); - print_property("Create context from binary list", QNN_PROPERTY_CONTEXT_SUPPORT_CREATE_FROM_BINARY_LIST_ASYNC); - print_property("Dynamic batch", QNN_PROPERTY_GRAPH_SUPPORT_BATCH_MULTIPLE); - print_property("Early termination", QNN_PROPERTY_GRAPH_SUPPORT_EARLY_TERMINATION); - print_property("Dynamic dimensions", QNN_PROPERTY_TENSOR_SUPPORT_DYNAMIC_DIMENSIONS); - print_property("Blockwise quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCK); - print_property("Blockwise quantization with expansion", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_BLOCKWISE_EXPANSION); - print_property("Vector quantization", QNN_PROPERTY_TENSOR_SUPPORT_QUANTIZATION_ENCODING_VECTOR); - print_property("Tensor sparsity", QNN_PROPERTY_TENSOR_SUPPORT_SPARSITY); - print_property("Updateable application tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_APP_TENSORS); - print_property("Updateable native tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_NATIVE_TENSORS); - print_property("Updateable static tensor", QNN_PROPERTY_TENSOR_SUPPORT_UPDATEABLE_STATIC_TENSORS); - print_property("Qnn group device", QNN_PROPERTY_GROUP_DEVICE); +static bool 
ggmlhexagon_is_cdsp(int domain_id) { + return (domain_id == HEXAGON_CDSP) || (domain_id == HEXAGON_CDSP1); } -void qnn_instance::htp_set_memory_grow_size(size_t size) { - QnnHtpPerfInfrastructure_MemoryConfig_t grow_size_config = { - .option = QNN_HTP_PERF_INFRASTRUCTURE_MEMORY_CONFIGOPTION_GROW_SIZE, - .memGrowSizeConfig = (uint32_t)size, - }; +static bool ggmlhexagon_is_valid_domain_id(int domain_id, int compute_only) { + int size = sizeof(hexagon_supported_domains) / sizeof(domain); - const QnnHtpPerfInfrastructure_MemoryConfig_t *memory_config[] = { - &grow_size_config, - nullptr, - }; - Qnn_ErrorHandle_t result = _qnn_htp_perfinfra->setMemoryConfig(_qnn_htp_device_id, _qnn_htp_core_id, memory_config); - if (QNN_SUCCESS != result) { - GGMLQNN_LOG_WARN("failed to set HTP memory config"); - } else { - GGMLQNN_LOG_INFO("succeed to set HTP memory config"); + if (compute_only) { + return ggmlhexagon_is_cdsp(domain_id); } -} -void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { - QnnHtpGraph_CustomConfig_t htp_hvx_thread_config = { - .option = QNN_HTP_GRAPH_CONFIG_OPTION_NUM_HVX_THREADS, - .numHvxThreads = n_threads, - }; + for (size_t i = 0; i < size; i++) { + if (hexagon_supported_domains[i].id == domain_id) + return true; + } - QnnGraph_Config_t hvx_thread_config = { - .option = QNN_GRAPH_CONFIG_OPTION_CUSTOM, - .customConfig = &htp_hvx_thread_config, - }; + return false; +} - const QnnGraph_Config_t * graph_configs[] = {&hvx_thread_config, nullptr}; - Qnn_ErrorHandle_t result = _qnn_raw_interface.graphSetConfig(_qnn_graph_handle, graph_configs); - if (QNN_SUCCESS != result) { - GGMLQNN_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); +static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_domains, fastrpc_domain ** domains_info) { + int hexagon_err = AEE_SUCCESS; + int ss_info = 0; + ss_info = strcmp(domain_type, "NSP")? 
HPASS: NSP; + system_req_payload req; + memset(&req, 0, sizeof(system_req_payload)); + req.id = FASTRPC_GET_DOMAINS; + req.sys.domains = nullptr; + fastrpc_domain * domain = nullptr; + + if (ss_info != 0) { + req.sys.flags = DOMAINS_LIST_FLAGS_SET_TYPE(req.sys.flags, ss_info); } else { - GGMLQNN_LOG_INFO("succeed to set QNN graph config: set hvx threads %d", n_threads); + req.sys.flags =0; } -} -void qnn_instance::htp_enter_performance_mode() { - QnnHtpPerfInfrastructure_PowerConfig_t dcvs_v3_config = { - .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_DCVS_V3, - .dcvsV3Config = - { - .contextId = _qnn_htp_powerconfig_id, +#ifdef _WIN32 + hexagon_err = AEE_EUNSUPPORTED; + goto bail; +#endif - .setDcvsEnable = 1, - .dcvsEnable = 0, + if (remote_system_request) { + hexagon_err = remote_system_request(&req); + if (hexagon_err != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d", hexagon_err); + goto bail; + } + //allocate memory for domain-info array + req.sys.max_domains = req.sys.num_domains; + void * buffer = calloc(req.sys.num_domains, sizeof(fastrpc_domain)); + if (nullptr == buffer) { + hexagon_err = AEE_ENOMEMORY; + GGMLHEXAGON_LOG_DEBUG("unable to allocate memory for req.sys.domains"); + goto bail; + } + req.sys.domains = static_cast(buffer); + hexagon_err = remote_system_request(&req); + if (hexagon_err != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d.\n", hexagon_err); + goto bail; + } - .powerMode = QNN_HTP_PERF_INFRASTRUCTURE_POWERMODE_PERFORMANCE_MODE, + for (int i = 0; i < req.sys.num_domains; i++) { + //verify that only requested type domains were returned + domain = &req.sys.domains[i]; + if (domain->type != ss_info) { + hexagon_err = -1; + GGMLHEXAGON_LOG_DEBUG("incorrect data received from remote_system_request.\n"); + goto bail; + } + } + *domains_info = req.sys.domains; + *num_domains = req.sys.num_domains; + } else { + hexagon_err = AEE_EUNSUPPORTED; + goto bail; + } - .setSleepLatency = 1, - .sleepLatency = 40, + bail: + if (hexagon_err && !req.sys.domains) { + free(req.sys.domains); + } + return hexagon_err; +} - .setSleepDisable = 1, - .sleepDisable = 1, +static int ggmlhexagon_get_dsp_support(int * domain) { + int hexagon_error = AEE_SUCCESS; + *domain = HEXAGON_CDSP; - .setBusParams = 1, - .busVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, - .busVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, - .busVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, + if (remote_handle_control) { + struct remote_dsp_capability dsp_capability_domain = {HEXAGON_CDSP, DOMAIN_SUPPORT, 0}; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + goto bail; + } - .setCoreParams = 1, - .coreVoltageCornerMin = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, - .coreVoltageCornerTarget = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, - .coreVoltageCornerMax = DCVS_VOLTAGE_VCORNER_MAX_VOLTAGE_CORNER, - }, - }; + if (0 == dsp_capability_domain.capability) { + dsp_capability_domain.domain = HEXAGON_ADSP; + dsp_capability_domain.attribute_ID = DOMAIN_SUPPORT; + dsp_capability_domain.capability = 0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if(dsp_capability_domain.capability) { + *domain = HEXAGON_ADSP; 
+ } + } - QnnHtpPerfInfrastructure_PowerConfig_t hmx_config = { - .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_HMX_V2, - .hmxV2Config = - { - .hmxPickDefault = 0, - .hmxVoltageCornerMin = DCVS_EXP_VCORNER_MAX, - .hmxVoltageCornerTarget = DCVS_EXP_VCORNER_MAX, - .hmxVoltageCornerMax = DCVS_EXP_VCORNER_MAX, - .hmxPerfMode = QNN_HTP_PERF_INFRASTRUCTURE_CLK_PERF_HIGH, - }, - }; + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("get_dsp_support failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } - QnnHtpPerfInfrastructure_PowerConfig_t rpc_ctrl_config = { - .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY, - .rpcControlLatencyConfig = 100, - }; + bail: + return hexagon_error; +} - QnnHtpPerfInfrastructure_PowerConfig_t rpc_poll_config = { - .option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME, - .rpcPollingTimeConfig = 9999, - }; +static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; - const QnnHtpPerfInfrastructure_PowerConfig_t * power_configs[] = { - &dcvs_v3_config, - &hmx_config, - &rpc_ctrl_config, - &rpc_poll_config, - nullptr, - }; - Qnn_ErrorHandle_t ret = _qnn_htp_perfinfra->setPowerConfig(_qnn_htp_powerconfig_id, power_configs); - if (ret != QNN_SUCCESS) { - GGMLQNN_LOG_WARN("failed to set HTP power config"); + if (attr == VTCM_PAGE || attr == VTCM_COUNT) { } else { - GGMLQNN_LOG_INFO("succeed to set HTP power config"); + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("unsupported attr, only VTCM_PAGE and VTCM_COUNT supported"); + goto bail; } -} -static void ggmlqnn_set_runtime_path(size_t device, const std::string & path) { - if ((QNN_BACKEND_NPU == device) || (HWACCEL_CDSP == g_qnn_params.hwaccel_approach)) { - if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), - 1)) { - GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); - } else { - GGMLQNN_LOG_ERROR("QNN NPU backend setenv failure"); - } - if (0 == setenv("ADSP_LIBRARY_PATH", - (path + - ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), - 1)) { - GGMLQNN_LOG_INFO("QNN NPU backend setenv successfully"); + if (remote_handle_control) { + if (domain == HEXAGON_ADSP || domain == HEXAGON_CDSP) { + /* + * query the DSP for VTCM information + * since the ADSP does not have a dedicated VTCM, we expect the output to be 0 + */ + struct remote_dsp_capability dsp_capability_vtcm_dsp; + dsp_capability_vtcm_dsp.domain = (uint32_t)domain; + dsp_capability_vtcm_dsp.attribute_ID = attr; + dsp_capability_vtcm_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + GGMLHEXAGON_LOG_DEBUG("running the use case without checking the capability"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_vtcm_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("get_vtcm_info failed with error 0x%x", hexagon_error); + goto bail; + } } else { - GGMLQNN_LOG_ERROR("QNN NPU backend setenv 
failure"); + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("unsupported domain %d", domain); + goto bail; } } else { - if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), - 1)) { - GGMLQNN_LOG_INFO("%s backend setenv successfully\n", - ggml_backend_qnn_get_devname(device)); - } else { - GGMLQNN_LOG_ERROR("%s backend setenv failure\n", - ggml_backend_qnn_get_devname(device)); - } - } -} - -static uint8_t * ggmlqnn_create_rpc_buffer(qnn_instance * instance, const ggml_tensor * ggml_tensor, Qnn_Tensor_t * qnn_tensor, bool b_copydata) { - if (nullptr == instance || nullptr == ggml_tensor || nullptr == qnn_tensor) { - GGMLQNN_LOG_WARN("invalid params\n"); - return nullptr; + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); } - uint8_t * qnn_rpcbuffer = static_cast(instance->alloc_rpcmem(ggml_nbytes(ggml_tensor), 4)); - if (nullptr == qnn_rpcbuffer) { - GGMLQNN_LOG_WARN("alloc rpcmem failure, %s\n", strerror(errno)); - return nullptr; - } else { - GGMLQNN_LOG_DEBUG("alloc rpcmem %p successfully\n", qnn_rpcbuffer); - } - if (b_copydata) - memcpy(qnn_rpcbuffer, ggml_tensor->data, ggml_nbytes(ggml_tensor)); - instance->register_rpcmem(qnn_rpcbuffer, qnn_tensor); - return qnn_rpcbuffer; + bail: + return hexagon_error; } -static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * package, const char * type, - Qnn_Param_t * params, uint32_t num_params, - Qnn_Tensor_t * inputs, uint32_t num_inputs, - Qnn_Tensor_t * outputs, uint32_t num_outputs) { +static bool ggmlhexagon_is_unsignedpd_supported(int domain_id) { + int hexagon_error = AEE_SUCCESS; + if (remote_handle_control) { + struct remote_dsp_capability dsp_capability_domain = {static_cast(domain_id), UNSIGNED_PD_SUPPORT, 0}; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device. Falling back to signed pd"); + return false; + } - char opcfg_name[GGML_MAX_NAME] = {}; + if (hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: FastRPC Capability API failed. 
falling back to signed pd", hexagon_error); + return false; + } - //ensure the opcfg name is unique - if (nullptr == name) { - snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%-8d", ggmlqnn_get_idx(QNN_OPCFG_INDEX)); + if (dsp_capability_domain.capability == 1) { + return true; + } } else { - snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_idx(QNN_OPCFG_INDEX)); + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device.falling back to signed pd"); + return false; } - GGMLQNN_LOG_DEBUG("create qnn opconfig %s", opcfg_name); - ggmlqnn_inc_idx(QNN_OPCFG_INDEX); - Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, - num_params, params, - num_inputs, inputs, - num_outputs, outputs - }; - Qnn_OpConfig_t opcfg = {QNN_OPCONFIG_VERSION_1, {v1}}; + return false; +} - return opcfg; +static bool ggmlhexagon_get_unsignedpd_support(void) { + return ggmlhexagon_is_unsignedpd_supported(HEXAGON_CDSP); } -static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, - const ggml_tensor * tensor, const char * name, - Qnn_TensorType_t qnn_tensor_type, - Qnn_DataType_t qnn_data_type, - uint32_t rank, uint32_t * dims, - void * data, uint32_t data_size, - bool b_transpose = false) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - char tensor_name[GGML_MAX_NAME] = {}; +static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { + int hexagon_error = AEE_SUCCESS; + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for ASYNC_FASTRPC_SUPPORT information + * Async fastrpc is supported only on CDSP + */ + struct remote_dsp_capability dsp_capability_async_support; + dsp_capability_async_support.domain = (uint32_t)domain; + dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT; + dsp_capability_async_support.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (dsp_capability_async_support.capability == 1) { + return true; + } - //ensure the tensor name is unique - if (nullptr == name) { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%-8d", ggmlqnn_get_idx(QNN_TENSOR_INDEX)); + if (hexagon_error != AEE_SUCCESS){ + GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_WARN("async FastRPC is not supported on domain %d", domain); + goto bail; + } } else { - snprintf(tensor_name, GGML_MAX_NAME, "tensor_%s%-8d", name, ggmlqnn_get_idx(QNN_TENSOR_INDEX)); + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); } - GGMLQNN_LOG_DEBUG("init_tensor %s", tensor_name); - ggmlqnn_inc_idx(QNN_TENSOR_INDEX); - uint32_t reverse_dims[GGML_MAX_DIMS] = {}; - uint32_t transpose_dims[GGML_MAX_DIMS] = {}; - uint32_t * tensor_dims = nullptr; - //case 1:use dims info from ggml tensor - if (nullptr != tensor) { - //there are different dimension order between ggml tensor and qnn tensor - for (size_t idx = 0; idx < rank; idx++) { - reverse_dims[idx] = (uint32_t)tensor->ne[rank - 1 - idx]; + bail: + return false; +} + +static void ggmlhexagon_set_rpc_latency(int domain, int qos, int latency) { + int 
hexagon_error = AEE_SUCCESS; + + if (remote_handle_control) { + struct remote_rpc_control_latency data; +/* + qos | latency + ----------------------- + RPC_PM_QOS | 300 + RPC_POLL_QOS | 1000 +*/ + data.enable = qos; + data.latency = latency; + hexagon_error = remote_handle64_control(DSPRPC_GET_DSP_INFO, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); + if (hexagon_error != AEE_SUCCESS) { + //FIXME: why set rpc latency failure + GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } else { + GGMLHEXAGON_LOG_INFO("set rpc qos %d, latency %d\n", qos, latency); } - tensor_dims = reverse_dims; - } - //case 2: use user's specified tensor_dims - if (nullptr != dims) { - tensor_dims = dims; + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); } - //case 3: transpose for dst tensor - if (b_transpose) { - GGML_ASSERT(tensor != nullptr); //ensure ggml_tensor is not nullptr for this special case - ggmlqnn_get_qnn_dimensions_from_ggml_dimensions(transpose_dims, reverse_dims, ggml_n_dims(tensor)); - tensor_dims = transpose_dims; - } + bail: + return; +} - Qnn_Tensor_t qnn_tensor = { - .version= QNN_TENSOR_VERSION_1, - {.v1= { - .id = 0, - .name = tensor_name, - .type = qnn_tensor_type, - .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, - .dataType = qnn_data_type, - .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, - .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, - .rank = rank, - .dimensions = tensor_dims, - .memType = QNN_TENSORMEMTYPE_RAW, - .clientBuf = {.data = nullptr, .dataSize = 0} - } - } - }; +static bool ggmlhexagon_is_status_notification_supported(int domain) { + int hexagon_error = AEE_SUCCESS; - Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); - if (nullptr == p_qnn_tensor) { - GGMLQNN_LOG_WARN("calloc failed"); - return nullptr; - } - error = deep_copy_qnn_tensors(qnn_tensor, * p_qnn_tensor); - if (error != QNN_SUCCESS) { - free(p_qnn_tensor); - GGMLQNN_LOG_WARN("init tensor failed"); - return nullptr; - } + if (remote_handle_control) { + /* + * Query the DSP for STATUS_NOTIFICATION_SUPPORT information + * DSP User PD status notification Support + */ + struct remote_dsp_capability dsp_capability_status_notification_support; + dsp_capability_status_notification_support.domain = (uint32_t)domain; + dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; + dsp_capability_status_notification_support.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (1 == dsp_capability_status_notification_support.capability) { + return true; + } - bool enable_npu_rpc = (instance->enable_qnn_rpc() && instance->get_device_id() == QNN_BACKEND_NPU); - if (enable_npu_rpc) { - QNN_VER_PTR(*p_qnn_tensor)->memType = QNN_TENSORMEMTYPE_MEMHANDLE; - QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {.data=nullptr, .dataSize=0}; + if (hexagon_error != AEE_SUCCESS){ + GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); + goto bail; + } } else { - QNN_VER_PTR(*p_qnn_tensor)->clientBuf = {data, data_size}; + hexagon_error = 
AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); } - QNN_INTERFACE_VER_TYPE qnn_raw_interface = instance->get_qnn_raw_interface(); - CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_qnn_tensor)); - return p_qnn_tensor; + bail: + return false; } -static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, - const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], - (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; - Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; - Qnn_TensorType_t qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; +static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; - if (0 == tensor->flags) { - qnn_tensor_type = tensor_type; - } else { - if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_WRITE; - } else if (tensor->flags & GGML_TENSOR_FLAG_OUTPUT) { - qnn_tensor_type = QNN_TENSOR_TYPE_APP_READ; - } + if (attr != HMX_SUPPORT_SPATIAL && attr != HMX_SUPPORT_DEPTH) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_WARN("unsupported attr, only HMX_SUPPORT_SPATIAL and HMX_SUPPORT_DEPTH supported"); + goto bail; } - qnn_data_type = ggmlqnn_datatype_from_ggml_datatype(tensor->type); - Qnn_Tensor_t * p_qnn_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, tensor, nullptr, - qnn_tensor_type, qnn_data_type, - ggml_n_dims(tensor), dimensions, - nullptr, 0); - return p_qnn_tensor; -} - -static void ggmlqnn_load_cfg() { - //this function can be called in various scenarios - static bool initialized = false; - if (initialized) { - GGMLQNN_LOG_INFO("qnn cfg file already loadded\n"); - return; - } - char time_string[GGML_QNN_TMPBUF_LEN]; - memset(time_string, 0, GGML_QNN_TMPBUF_LEN); - ggmlqnn_get_timestring(time_string); - GGMLQNN_LOG_DEBUG("program running start time:%s", time_string); - std::string cfg_filename = std::string(g_qnn_params.qnn_runtimelib_path) + std::string(g_qnn_params.qnn_cfgfilename); - GGMLQNN_LOG_INFO("load ggml-qnn config from %s", cfg_filename.c_str()); - qnn_cfg qnncfg_instance; - qnncfg_instance.load(cfg_filename); - qnncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { - std::ostringstream tmposs; - tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]" << std::endl; - GGMLQNN_LOG_INFO("%s", tmposs.str().c_str()); - }); - std::string precision_mode; - qnncfg_instance.get_intvalue("general", "print_qnn_internal_log", g_qnn_params.print_qnn_internal_log, 0); - qnncfg_instance.get_intvalue("general", "enable_perf", g_qnn_params.enable_perf, 0); - qnncfg_instance.get_intvalue("general", "print_tensors_info", g_qnn_params.print_tensors_info, 0); - qnncfg_instance.get_intvalue("general", "dump_op_info", g_qnn_params.dump_op_info, 0); - qnncfg_instance.get_intvalue("general", "hwaccel_approach", g_qnn_params.hwaccel_approach, 0); - qnncfg_instance.get_intvalue("general", "qnn_backend", g_qnn_params.qnn_backend, 2); - qnncfg_instance.get_intvalue("qnn", "hvx_threads", g_qnn_params.hvx_threads, 4); - qnncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_qnn_params.vtcm_size_in_mb, 8); - qnncfg_instance.get_intvalue("qnn", 
"enable_dlbc", g_qnn_params.enable_dlbc, 0); - qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); - qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_qnn_params.enable_mulmat_cdsp, 0); - qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_qnn_params.enable_q_mulmat, 0); - GGMLQNN_LOG_INFO("print_qnn_internal_log=%d", g_qnn_params.print_qnn_internal_log); - GGMLQNN_LOG_INFO("hwaccel_approach=%d(%s)", g_qnn_params.hwaccel_approach, - ggmlqnn_get_hwaccel_approach_name(g_qnn_params.hwaccel_approach)); - GGMLQNN_LOG_INFO("qnn_backend=%d", g_qnn_params.qnn_backend); - GGMLQNN_LOG_INFO("npu inference precision mode=%s", precision_mode.c_str()); - GGMLQNN_LOG_INFO("qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); - if (precision_mode.find("fp16") != std::string::npos) { - g_qnn_params.precision_mode = 1; + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HMX SUPPORT information + * HMX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hmx_dsp; + dsp_capability_hmx_dsp.domain = (uint32_t)domain; + dsp_capability_hmx_dsp.attribute_ID = attr; + dsp_capability_hmx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } + else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hmx_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("get_hmx_support_info failed with Error 0x%x", hexagon_error); + goto bail; + } + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("HMX support is not there for domain %d", domain); + goto bail; + } } else { - g_qnn_params.precision_mode = 0; + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); } - initialized = true; + + bail: + return hexagon_error; } -// ================================================================================================= -// section-8: implementation of ggml-hexagon backend according to ggml backend subsystem -// ================================================================================================= -static bool ggmlqnn_same_types(const ggml_backend_qnn_context * ctx, const ggml_tensor * op_tensor) { - GGML_UNUSED(ctx); - ggml_tensor * src0 = op_tensor->src[0]; - ggml_tensor * src1 = op_tensor->src[1]; - if (nullptr != src1) { - if (src0->type != op_tensor->type || src1->type != op_tensor->type) { - return false; +static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) { + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if(remote_handle_control) { + /* + * Query the Hexagon processor architecture version information + */ + struct remote_dsp_capability dsp_capability_arch_ver; + dsp_capability_arch_ver.domain = (uint32_t)domain; + dsp_capability_arch_ver.attribute_ID = ARCH_VER; + dsp_capability_arch_ver.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_arch_ver, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + 
*capability = dsp_capability_arch_ver.capability & 0xFF; + } else { + GGMLHEXAGON_LOG_DEBUG("get_hex_arch_ver failed with error 0x%x", hexagon_error); + goto bail; } } else { - if (src0->type != op_tensor->type) { - return false; - } + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); } - if (src0->type != GGML_TYPE_F32) - return false; - - return true; + bail: + return hexagon_error; } -static bool ggmlhexagon_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { - ggmlqnn_dump_op_info(op_tensor); - if (!ggmlhexagon_k_op_caps[ggmlqnn_get_op_index(op_tensor)].supported) { - return false; - } - - struct ggml_tensor * src0 = op_tensor->src[0]; - struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = op_tensor->src[0]->ne[0]; - uint32_t src0_rank = ggml_n_dims(src0); - uint32_t src1_rank = 0; - if (nullptr != src1) { - src1_rank = ggml_n_dims(src1); +static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t * capability) +{ + int hexagon_error = AEE_SUCCESS; + *capability = 0; + if (attr == HVX_SUPPORT_64B) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("latest targets have 128 byte HVX register, use HVX_SUPPORT_128B instead of HVX_SUPPORT_64B"); + goto bail; } - //available in the early stage, should be removed in the product stage - bool support = false; - if (g_qnn_params.enable_mulmat_cdsp) - support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); - else - support = (op_tensor->op == GGML_OP_ADD); - if (!support) { - return false; + if (attr != HVX_SUPPORT_128B) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("unsupported attr. only HVX_SUPPORT_128B supported"); + goto bail; } - switch (op_tensor->op) { - case GGML_OP_ADD: - case GGML_OP_SUB: - { - if (!ggml_are_same_shape(src0, src1)) { - return false; + if (remote_handle_control) { + if (domain == HEXAGON_CDSP) { + /* + * Query the DSP for HVX SUPPORT information + * HVX is supported on CDSP only + */ + struct remote_dsp_capability dsp_capability_hvx_dsp; + dsp_capability_hvx_dsp.domain = (uint32_t)domain; + dsp_capability_hvx_dsp.attribute_ID = attr; + dsp_capability_hvx_dsp.capability = (uint32_t)0; + hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); + if ((hexagon_error & 0xFF)==(AEE_EUNSUPPORTEDAPI & 0xFF)) { + GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); + hexagon_error = AEE_SUCCESS; + goto bail; + } else if (hexagon_error == AEE_SUCCESS) { + *capability = dsp_capability_hvx_dsp.capability; + } else { + GGMLHEXAGON_LOG_DEBUG("failed with error 0x%x", hexagon_error); + goto bail; } - break; + } else { + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("HVX support is not available on domain %d", domain); + goto bail; } - case GGML_OP_MUL_MAT: - { - ggmlqnn_dump_op_info(op_tensor); + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; + GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); + } - //TODO:3d&4d matrix mulmat on cDSP - if (src0_rank != 2) - return false; + bail: + return hexagon_error; +} - if (g_qnn_params.enable_q_mulmat) - return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) - && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); - else - return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == 
GGML_TYPE_F32); +static int ggmlhexagon_request_status_notifications(int domain_id, void * context, notify_callback_fn call_back_fn) { + int hexagon_error = AEE_SUCCESS; + struct remote_rpc_notif_register notif; + bool status_notification_support; + + notif.context = context; + notif.domain = domain_id; + notif.notifier_fn = call_back_fn; + + status_notification_support = ggmlhexagon_is_status_notification_supported(domain_id); + if (status_notification_support) { + hexagon_error = remote_session_control(FASTRPC_REGISTER_STATUS_NOTIFICATIONS, (void*)¬if, sizeof(notif)); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed to enable status notifications", hexagon_error); } - default: - break; + } else { + hexagon_error = AEE_EUNSUPPORTEDAPI; } - return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + + return hexagon_error; } -static bool ggmlqnn_can_handle_op(const ggml_backend_qnn_context * ctx, const struct ggml_tensor * op_tensor) { - if (op_tensor->op == GGML_OP_NONE) { - return true; +static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx, size_t * rpcmem_capacity) { + size_t candidate_size = 0; + uint8_t * rpc_buffer = nullptr; + const int SIZE_IN_MB = (1 << 20); + size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + for (size_t idx = 0; idx < probe_counts; idx++) { + rpc_buffer = static_cast(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); + if (nullptr == rpc_buffer) { + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + break; + } else { + candidate_size = probe_slots[idx]; + rpcmem_free(rpc_buffer); + rpc_buffer = nullptr; + } } - if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { - return ggmlhexagon_can_handle_op(ctx, op_tensor); - } + *rpcmem_capacity = candidate_size; + GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MB", *rpcmem_capacity); - if (!ggmlqnn_k_op_caps[ggmlqnn_get_op_index(op_tensor)].supported) { - return false; - } + uint32_t dsp_version = 0; + ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version); - struct ggml_tensor * src0 = op_tensor->src[0]; - struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = op_tensor->src[0]->ne[0]; - uint32_t src0_rank = ggml_n_dims(src0); - uint32_t src1_rank = 0; - if (nullptr != src1) { - src1_rank = ggml_n_dims(src1); + if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || dsp_version == 0x75 || dsp_version == 0x79) { + GGMLHEXAGON_LOG_INFO("dsp arch version 0x%x", dsp_version); + //0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79 + size_t htp_arch = ggmlhexagon_htparch_hex_to_decimal(dsp_version); + GGMLHEXAGON_LOG_DEBUG("dsp arch version %d", htp_arch); + struct qcom_socinfo * socinfo = ggmlhexagon_get_socinfo_from_socmodel(htp_arch); + if (nullptr != socinfo) { + //got fully description of SoC when hwaccel approach is HWACCEL_CDSP + GGMLHEXAGON_LOG_INFO("device info: %s, %s", socinfo->soc_desc, ggmlhexagon_get_htparch_desc(htp_arch)); + } + } else { + GGMLHEXAGON_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version); } - switch (op_tensor->op) { - case GGML_OP_ADD: - case GGML_OP_SUB: - { - if (!ggml_are_same_shape(src0, src1)) { - return false; - } - - if (ne00 < 32) - return false; + uint32_t vtcm_count = 0; + uint32_t vtcm_page = 0; + ggmlhexagon_get_vtcm_info(ctx->domain_id, 
VTCM_COUNT, &vtcm_count); + ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE, &vtcm_page); + GGMLHEXAGON_LOG_DEBUG("vtcm_count %d", vtcm_count); + GGMLHEXAGON_LOG_DEBUG("vtcm_page %d", vtcm_page); - return ggmlqnn_same_types(ctx, op_tensor); - } + uint32_t hmx_depth = 0; + uint32_t hmx_spatial = 0; + ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH, &hmx_depth); + ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial); + GGMLHEXAGON_LOG_DEBUG("hmx_depth %d", hmx_depth); + GGMLHEXAGON_LOG_DEBUG("hmx_spatial %d", hmx_spatial); - case GGML_OP_DIV: - case GGML_OP_MUL: { - if (ctx->device == QNN_BACKEND_NPU) - return false; + uint32_t hvx_support_128b = 0; + ggmlhexagon_get_hvx_support_info(ctx->domain_id, HVX_SUPPORT_128B, &hvx_support_128b); + GGMLHEXAGON_LOG_DEBUG("hvx_support_128b %d", hvx_support_128b); - if (!ggml_are_same_shape(src0, src1)) { - return false; - } + GGMLHEXAGON_LOG_DEBUG("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); + GGMLHEXAGON_LOG_DEBUG("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); +} - if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix mul - return false; +static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { + int hexagon_error = AEE_SUCCESS; - return ggmlqnn_same_types(ctx, op_tensor); - } - case GGML_OP_MUL_MAT: - { - ggmlqnn_dump_op_info(op_tensor); - if (src0_rank != src1_rank) // make QNN SDK happy - return false; + int domain_id = HEXAGON_CDSP; + const char * domain_type = "NSP"; - if (src0_rank < 2) // QNN's limitation, make QNN SDK happy - return false; + int unsignedpd_flag = 1; + bool is_unsignedpd_enabled = false; + int use_logical_id = 0; + int core_id = -1; + fastrpc_domain * domains_info = NULL; + fastrpc_domain * domain_info = NULL; + int num_domains = -1; - if (4 == src0_rank) //TODO: 4D matrix mulmat in CT - return false; + domain * my_domain = NULL; + char * uri = NULL; - if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy - return false; + char * ggmlop_domain_uri = NULL; + int ggmlop_domain_uri_len = 0; - if (ctx->device == QNN_BACKEND_NPU) { - return (src0->type == GGML_TYPE_F32 - || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 - || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K - ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + if (nullptr == ctx) + return 1; + GGMLHEXAGON_LOG_INFO("init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + //TODO: reasonable rpc memory pool size and use it practically + ctx->ggmlop_handle = -1; + ctx->rpc_mempool_len = (1 << 20) * 512; + ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, ctx->rpc_mempool_len); + if (nullptr == ctx->rpc_mempool) { + hexagon_error = AEE_ENORPCMEMORY; + printf("rpc memory alloc failed", hexagon_error); + ctx->rpc_mempool_len = 0; + return 2; + } + + if (-1 == domain_id) { + if (nullptr != domain_type) { + if ((strcmp(domain_type, "NSP") != 0 && strcmp(domain_type, "HPASS") != 0)) { + GGMLHEXAGON_LOG_WARN("invalid domain_type %s. 
possible values are NSP or HPASS", domain_type); + goto bail; } else { - return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) - && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + hexagon_error = ggmlhexagon_get_domains_info(domain_type, &num_domains, &domains_info); + if (hexagon_error == AEE_EUNSUPPORTED) { + GGMLHEXAGON_LOG_DEBUG("API is not supported on this target so cannot get domains info from the device. falling back to legacy approach of using default domain id"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to CDSP domain", hexagon_error); + } + } else if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error in getting domains information"); + goto bail; + } else { + if (core_id != -1) { + if (core_id < 0 || core_id >= num_domains) { + GGMLHEXAGON_LOG_DEBUG("invalid core_id = %d for %s. core_id should be between 0 to %d", core_id, domain_type, num_domains - 1); + hexagon_error = AEE_EBADPARM; + goto bail; + } + } else { + core_id = 0; + } + use_logical_id = 1; + domain_id = domains_info[core_id].id; + } + } + } else { + GGMLHEXAGON_LOG_DEBUG("DSP domain is not provided, retrieving DSP information using Remote APIs"); + hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); + if (hexagon_error != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to CDSP domain", hexagon_error); } } - case GGML_OP_LOG: - { - if (ctx->device == QNN_BACKEND_NPU) - return false; - } - case GGML_OP_SQRT: - default: - return ggmlqnn_same_types(ctx, op_tensor); } -} -static bool ggmlqnn_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { - ggmlqnn_op_func_t func = nullptr; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; + if (0 == use_logical_id) { + if (!ggmlhexagon_is_valid_domain_id(domain_id, 0)) { + hexagon_error = AEE_EBADPARM; + GGMLHEXAGON_LOG_DEBUG("error 0x%x: invalid domain %d", hexagon_error, domain_id); + goto bail; + } - if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { - ggmlhexagon_compute(ctx, dst); - return true; + my_domain = ggmlhexagon_get_domain(domain_id); + if (nullptr == my_domain) { + GGMLHEXAGON_LOG_DEBUG("unable to get domain struct %d", domain_id); + goto bail; + } + uri = my_domain->uri; } + GGMLHEXAGON_LOG_DEBUG("temporary domain uri=%s\n", uri); - switch (dst->op) { - case GGML_OP_REPEAT: - ggmlqnn_compute_repeat(ctx, dst); - break; - case GGML_OP_GET_ROWS: - ggmlqnn_compute_get_rows(ctx, dst); - break; - case GGML_OP_DUP: - ggmlqnn_compute_dup(ctx, dst); - break; - case GGML_OP_ADD: - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_SQRT: - case GGML_OP_LOG: - func = ggmlqnn_compute_elementwise; - break; - case GGML_OP_ACC: - ggmlqnn_compute_acc(ctx, dst); - break; - case GGML_OP_UNARY: - switch (ggml_get_unary_op(dst)) { - case GGML_UNARY_OP_GELU: - break; - case GGML_UNARY_OP_SILU: - break; - case GGML_UNARY_OP_GELU_QUICK: - break; - case GGML_UNARY_OP_TANH: - break; - case GGML_UNARY_OP_RELU: - break; - case GGML_UNARY_OP_HARDSIGMOID: - break; - case GGML_UNARY_OP_HARDSWISH: - break; - default: - return false; - } - break; - case GGML_OP_NORM: - ggmlqnn_compute_norm(ctx, dst); - break; - case GGML_OP_GROUP_NORM: - ggmlqnn_compute_group_norm(ctx, dst); - break; - case GGML_OP_CONCAT: - ggmlqnn_compute_concat(ctx, dst); - break; - case GGML_OP_UPSCALE: - ggmlqnn_compute_upsample_nearest2d(ctx, dst); - break; - case GGML_OP_PAD: - 
ggmlqnn_compute_pad(ctx, dst); - break; - case GGML_OP_ARANGE: - ggmlqnn_compute_arange(ctx, dst); - break; - case GGML_OP_TIMESTEP_EMBEDDING: - ggmlqnn_compute_timestep_embedding(ctx, dst); - break; - case GGML_OP_LEAKY_RELU: - ggmlqnn_compute_leaky_relu(ctx, dst); - break; - case GGML_OP_RMS_NORM: - ggmlqnn_compute_rms_norm(ctx, dst); - break; - case GGML_OP_MUL_MAT: - ggmlqnn_compute_mul_mat(ctx, dst); - break; - case GGML_OP_MUL_MAT_ID: - return false; - case GGML_OP_SCALE: - ggmlqnn_compute_scale(ctx, dst); - break; - case GGML_OP_SQR: - ggmlqnn_compute_sqr(ctx, dst); - break; - case GGML_OP_CLAMP: - ggmlqnn_compute_clamp(ctx, dst); - break; - case GGML_OP_CPY: - ggmlqnn_compute_cpy(ctx, dst); - break; - case GGML_OP_CONT: - ggmlqnn_compute_dup(ctx, dst); - break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_PERMUTE: - case GGML_OP_TRANSPOSE: - break; - case GGML_OP_DIAG_MASK_INF: - ggmlqnn_compute_diag_mask(ctx, dst, -INFINITY); - break; - case GGML_OP_SOFT_MAX: - ggmlqnn_compute_softmax(ctx, dst); - break; - case GGML_OP_ROPE: - ggmlqnn_compute_rope(ctx, dst); - break; - case GGML_OP_IM2COL: - ggmlqnn_compute_im2col(ctx, dst); - break; - case GGML_OP_POOL_2D: - ggmlqnn_compute_pool2d(ctx, dst); - break; - case GGML_OP_SUM_ROWS: - ggmlqnn_compute_sum_rows(ctx, dst); - break; - case GGML_OP_ARGSORT: - ggmlqnn_compute_argsort(ctx, dst); - break; - default: - return false; + if (1 == unsignedpd_flag) { + is_unsignedpd_enabled = ggmlhexagon_is_unsignedpd_supported(domain_id); + if (!is_unsignedpd_enabled) { + GGMLHEXAGON_LOG_DEBUG("overriding user request for unsigned PD, only signed offload is allowed on domain %d", domain_id); + unsignedpd_flag = 0; + } } - if (nullptr != func) - func(ctx, dst); - - return true; -} - -struct ggml_backend_qnn_buffer_context { - ~ggml_backend_qnn_buffer_context() { - if (buffer) { - ggml_aligned_free(buffer, 0); + ctx->domain_id = domain_id; + GGMLHEXAGON_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); + if (is_unsignedpd_enabled) { + if (remote_session_control) { + struct remote_rpc_control_unsigned_module data; + data.enable = 1; + data.domain = domain_id; + hexagon_error = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); + GGMLHEXAGON_LOG_DEBUG("remote_session_control returned %d for configuring unsigned PD success", hexagon_error); + if (AEE_SUCCESS != hexagon_error) { + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed", hexagon_error); + } + } else { + GGMLHEXAGON_LOG_DEBUG("unsigned PD not supported on this device"); + hexagon_error = AEE_EUNSUPPORTED; + GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control interface is not supported on this device", hexagon_error); } + } - for (auto * sub_buffer : sub_buffers) { - free(sub_buffer); + hexagon_error = ggmlhexagon_request_status_notifications(domain_id, (void *)STATUS_CONTEXT, ggmlhexagon_pd_status_notifier_callback); + if (AEE_SUCCESS != hexagon_error) { + if (AEE_EUNSUPPORTEDAPI != hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: hexagon_request_status_notifications failed", hexagon_error); } + GGMLHEXAGON_LOG_WARN("error 0x%x: failed to compute on domain %d", hexagon_error, domain_id); + goto bail; + } - sub_buffers.clear(); + ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; + ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); + snprintf(ggmlop_domain_uri, 
ggmlop_domain_uri_len, "%s%s", ggmlop_URI, uri); + GGMLHEXAGON_LOG_INFO("ggmlop domain uri:%s", ggmlop_domain_uri); + hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); + if (AEE_SUCCESS == hexagon_error) { + GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); + GGMLHEXAGON_LOG_INFO("only support GGML_OP_ADD and GGML_OP_MUL_MAT on cDSP currently"); + ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1); + size_t rpcmem_size = 0; + ggmlhexagon_probe_dspinfo(ctx, &rpcmem_size); + ggmlhexagon_set_rpc_latency(domain_id, RPC_POLL_QOS, 1000); + } else { + GGMLHEXAGON_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, + ggmlhexagon_get_dsp_name(domain_id)); + goto bail; } - void * buffer = nullptr; - struct ggml_backend_qnn_context * backend_ctx = nullptr; + return 0; + bail: + if (ggmlop_domain_uri) { + free(ggmlop_domain_uri); + } - size_t buffer_size = 0; - std::vector sub_buffers; -}; + if (ctx->rpc_mempool) { + rpcmem_free(ctx->rpc_mempool); + ctx->rpc_mempool = nullptr; + ctx->rpc_mempool_len = 0; + ctx->ggmlop_handle = -1; + ctx->domain_id = -1; + } -static void ggml_backend_qnn_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - delete ctx; + return -1; } -static void * ggml_backend_qnn_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - return ctx->buffer; -} +static void ggmlhexagon_close_cdsp(ggml_backend_hexagon_context * ctx) { + int hexagon_error = AEE_SUCCESS; + GGMLHEXAGON_LOG_INFO("enter %s", __func__); + if (-1 != ctx->ggmlop_handle) { + hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); + if (AEE_SUCCESS != hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error); + } else { + ctx->ggmlop_handle = -1; + } + } -static enum ggml_status ggml_backend_qnn_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - GGML_UNUSED(tensor); - GGML_UNUSED(ctx); - return GGML_STATUS_SUCCESS; + if (ctx->rpc_mempool) { + rpcmem_free(ctx->rpc_mempool); + ctx->rpc_mempool = nullptr; + ctx->rpc_mempool_len = 0; + ctx->domain_id = -1; + } + GGMLHEXAGON_LOG_INFO("leave %s", __func__); } -static void ggml_backend_qnn_buffer_set_tensor(ggml_backend_buffer_t buffer, - ggml_tensor * tensor, const void * data, - size_t offset, size_t size) { - GGML_UNUSED(buffer); +static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_tensor * op) { + //skip sanity check because already checked in other place + struct dsptensor dsptensor_0; + struct dsptensor dsptensor_1; + struct dsptensor dsptensor_2; + std::string op_name; + ggmlhexagon_get_opkey_from_op(op, op_name); - memcpy((char *)tensor->data + offset, data, size); -} + hexagon_perf op_perf(op_name); + op_perf.start(); -static void ggml_backend_qnn_buffer_memset_tensor(ggml_backend_buffer_t buffer, - struct ggml_tensor * tensor, - uint8_t value, size_t offset, size_t size) { - GGML_UNUSED(buffer); - memset((char *)tensor->data + offset, value, size); -} + int hexagon_error = AEE_SUCCESS; + ggmlhexagon_op_func_t op_func = nullptr; + size_t input_tensor_count = 2; -static void ggml_backend_qnn_buffer_get_tensor(ggml_backend_buffer_t buffer, - const ggml_tensor * tensor, - 
void * data, size_t offset, size_t size) { - GGML_UNUSED(buffer); - memcpy(data, (const char *)tensor->data + offset, size); -} + ggml_tensor * src0 = op->src[0]; + ggml_tensor * src1 = op->src[1]; + ggml_tensor * dst = op; -static bool ggml_backend_qnn_buffer_cpy_tensor(ggml_backend_buffer_t buffer, - const struct ggml_tensor * src, - struct ggml_tensor * dst) { - GGML_UNUSED(buffer); - if (ggml_backend_buffer_is_host(src->buffer)) { - memcpy(dst->data, src->data, ggml_nbytes(src)); - return true; + input_tensor_count = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].input_param_count; + op_func = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].dsp_op_func; + if (nullptr == op_func) { + GGMLHEXAGON_LOG_DEBUG("op GGML_OP_%s and dsp func %s not supported on cCSP", ggml_op_name(op->op), ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].hexagon_op_name); + return; } - return false; -} + dsptensor_0.data = src0->data; + dsptensor_0.data_len = ggml_nbytes(src0); + dsptensor_0.type = src0->type; -static void ggml_backend_qnn_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_qnn_buffer_context * ctx = (ggml_backend_qnn_buffer_context *)buffer->context; - memset(ctx->buffer, value, ctx->buffer_size); -} + dsptensor_0.ne[0] = src0->ne[0]; + dsptensor_0.ne[1] = src0->ne[1]; + dsptensor_0.ne[2] = src0->ne[2]; + dsptensor_0.ne[3] = src0->ne[3]; -static ggml_backend_buffer_i ggml_backend_qnn_buffer_interface = { - /* .free_buffer = */ ggml_backend_qnn_buffer_free_buffer, - /* .get_base = */ ggml_backend_qnn_buffer_get_base, - /* .init_tensor = */ ggml_backend_qnn_buffer_init_tensor, - /* .memset_tensor = */ ggml_backend_qnn_buffer_memset_tensor, - /* .set_tensor = */ ggml_backend_qnn_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_qnn_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_qnn_buffer_cpy_tensor, - /* .clear = */ ggml_backend_qnn_buffer_clear, - /* .reset = */ nullptr, -}; + dsptensor_0.nb[0] = src0->nb[0]; + dsptensor_0.nb[1] = src0->nb[1]; + dsptensor_0.nb[2] = src0->nb[2]; + dsptensor_0.nb[3] = src0->nb[3]; -static const char * ggml_backend_qnn_buffer_type_name(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return "qnn-buffer"; -} + if (2 == input_tensor_count) { + dsptensor_1.data = src1->data; + dsptensor_1.type = src1->type; + dsptensor_1.data_len = ggml_nbytes(src1); -static ggml_backend_buffer_t ggml_backend_qnn_buffer_type_alloc_buffer( - ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_qnn_buffer_context * ctx = new ggml_backend_qnn_buffer_context; + dsptensor_1.ne[0] = src1->ne[0]; + dsptensor_1.ne[1] = src1->ne[1]; + dsptensor_1.ne[2] = src1->ne[2]; + dsptensor_1.ne[3] = src1->ne[3]; - size_t size_page = 0; -#if defined(__ANDROID__) || defined(__linux__) - size_page = sysconf(_SC_PAGESIZE); -#else - SYSTEM_INFO systeminfo; - GetSystemInfo(&systeminfo); - size_page = systeminfo.dwPageSize; -#endif - size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { - size_aligned += (size_page - (size_aligned % size_page)); - } - ctx->buffer = ggml_aligned_malloc(size_aligned); - ctx->buffer_size = size_aligned; - if (nullptr == ctx->buffer) { - GGMLQNN_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20)); - return nullptr; + dsptensor_1.nb[0] = src1->nb[0]; + dsptensor_1.nb[1] = src1->nb[1]; + dsptensor_1.nb[2] = src1->nb[2]; + dsptensor_1.nb[3] = src1->nb[3]; } - return ggml_backend_buffer_init(buft, ggml_backend_qnn_buffer_interface, ctx, size); -} - -static size_t 
ggml_backend_qnn_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return 32; -} + dsptensor_2.data = dst->data; + dsptensor_2.data_len = ggml_nbytes(dst); + dsptensor_2.type = dst->type; -static size_t ggml_backend_qnn_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); + dsptensor_2.ne[0] = dst->ne[0]; + dsptensor_2.ne[1] = dst->ne[1]; + dsptensor_2.ne[2] = dst->ne[2]; + dsptensor_2.ne[3] = dst->ne[3]; - return (2 * (1 << 29)); -} + dsptensor_2.nb[0] = dst->nb[0]; + dsptensor_2.nb[1] = dst->nb[1]; + dsptensor_2.nb[2] = dst->nb[2]; + dsptensor_2.nb[3] = dst->nb[3]; -static bool ggml_backend_qnn_buffer_is_host(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - return true; -} + //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_0); + //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_1); + //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_2); + hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); + if (AEE_SUCCESS != hexagon_error) { + GGMLHEXAGON_LOG_WARN("ggmlop %s computation fail on cdsp", ggml_op_name(op->op)); + } -static const char * ggml_backend_qnn_name(ggml_backend_t backend) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) backend->context; - return g_qnn_mgr[ctx->device].name; + op_perf.info(); + return; } -static void ggml_backend_qnn_free(ggml_backend_t backend) { - GGMLQNN_LOG_DEBUG("enter %s", __func__ ); - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; +// ================================================================================================= +// section-8: implementation of ggml-hexagon backend according to specification in ggml backend subsystem +// ================================================================================================= +//hwaccel through cDSP +static bool ggmlhexagon_can_handle_op(const ggml_backend_hexagon_context * ctx, const struct ggml_tensor * op_tensor) { + ggmlhexagon_dump_op_info(op_tensor); + if (!ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) { + return false; + } - qnn_instance * instance = (qnn_instance*)g_qnn_mgr[ctx->device].instance; - if (instance != nullptr) { - std::map::iterator singlenode_graph_it; - for (singlenode_graph_it = ctx->qnn_singlenode_graph_map.begin(); - singlenode_graph_it != ctx->qnn_singlenode_graph_map.end(); singlenode_graph_it++) { - auto & graph_res = singlenode_graph_it->second; - Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_res); - qnn_ptensors_t & ptensors = std::get<1>(graph_res); - for (auto tensor_it = ptensors.begin(); tensor_it != ptensors.end(); ++tensor_it) { - free_qnn_tensor(*tensor_it); - } - GGML_UNUSED(graph_handle); - GGMLQNN_LOG_DEBUG("clean up graph:%s", singlenode_graph_it->first.c_str()); - } - ctx->qnn_singlenode_graph_map.clear(); + struct ggml_tensor * src0 = op_tensor->src[0]; + struct ggml_tensor * src1 = op_tensor->src[1]; + const int64_t ne00 = op_tensor->src[0]->ne[0]; + uint32_t src0_rank = ggml_n_dims(src0); + uint32_t src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); + } - instance->qnn_finalize(); - delete instance; - g_qnn_mgr[ctx->device].instance = nullptr; + //available in the early stage, should be removed in the product stage + bool support = false; + if (g_hexagon_appcfg.enable_mulmat_cdsp) + support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); + else + support = (op_tensor->op == GGML_OP_ADD); + if (!support) { + return false; } - if (g_qnn_mgr[ctx->device].backend != 
nullptr) { - if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { - ggmlhexagon_close_cdsp(ctx); + switch (op_tensor->op) { + case GGML_OP_ADD: + case GGML_OP_SUB: + { + if (!ggml_are_same_shape(src0, src1)) { + return false; + } + break; } + case GGML_OP_MUL_MAT: + { + ggmlhexagon_dump_op_info(op_tensor); - delete backend; - g_qnn_mgr[ctx->device].backend = nullptr; - ggmlqnn_print_running_timestamp(ctx); - } - GGMLQNN_LOG_DEBUG("leave %s", __func__ ); -} - -static enum ggml_status ggmlqnn_backend_graph_compute_general(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - enum ggml_status result = GGML_STATUS_SUCCESS; - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *)backend->context; - GGML_UNUSED(ctx); + //TODO:3d&4d matrix mulmat on cDSP + if (src0_rank != 2) + return false; - for (int i = 0; i < cgraph->n_nodes; i++) { - ggml_tensor * node = cgraph->nodes[i]; - if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE - || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW - || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { - continue; - } - bool ok = ggmlqnn_compute_forward(backend, node); - if (!ok) { - GGMLQNN_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + if (g_hexagon_appcfg.enable_q_mulmat) + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) + && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + else + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } + default: + break; } - - return result; + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } -static const char * ggml_backend_qnn_device_get_name(ggml_backend_dev_t dev) { - struct ggml_backend_qnn_context *ctx = static_cast(dev->context); - if (nullptr == ctx) { - GGMLQNN_LOG_ERROR("pls check why ctx is null"); - return "unknown"; +static bool ggmlbackend_can_handle_op(const ggml_backend_hexagon_context * ctx, const struct ggml_tensor * op_tensor) { + if (op_tensor->op == GGML_OP_NONE) { + return true; } - return ctx->name; -} -static const char * ggml_backend_qnn_device_get_description(ggml_backend_dev_t dev) { - struct ggml_backend_qnn_context * ctx = static_cast(dev->context); - static char qnn_device_desc[256]; - if (nullptr == ctx) { - GGMLQNN_LOG_ERROR("pls check why ctx is null"); - return "unknown"; - } - if (0 == strncmp(ctx->name, "qnn-npu", 7)) { - const char * soc_info = ggmlqnn_get_socmodel_desc(ctx->socinfo.soc_model); - const char * htp_arch = ggmlqnn_get_htparch_desc(ctx->socinfo.htp_arch); - std::string dev_desc = std::string(ctx->desc) - + std::string(soc_info) + "_" + std::string(htp_arch) - + "," + std::string(ctx->socinfo.soc_desc); - memset(qnn_device_desc, 0, 256); - memcpy(qnn_device_desc, dev_desc.c_str(), strlen(dev_desc.c_str())); - return qnn_device_desc; - } else { - return ctx->desc; + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + return ggmlhexagon_can_handle_op(ctx, op_tensor); } -} -static void ggml_backend_qnn_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - struct ggml_backend_qnn_context * ctx = static_cast(dev->context); - if ((nullptr == ctx) || (ctx->device > QNN_BACKEND_GGML)) { - GGMLQNN_LOG_ERROR("pls check params"); - *free = 0; - *total = 0; + if (!ggmlqnn_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) { + return false; } - if (QNN_BACKEND_CPU == ctx->device || QNN_BACKEND_GGML == 
ctx->device) { - *total = ggmlqnn_get_system_total_memory_in_bytes(); - *free = ggmlqnn_get_system_free_memory_in_bytes(); - } else if (QNN_BACKEND_GPU == ctx->device) { - //TODO: probe GPU info in Qualcomm Adreno GPU - *total = ggmlqnn_get_system_total_memory_in_bytes(); - *free = ggmlqnn_get_system_free_memory_in_bytes(); - } else if (QNN_BACKEND_NPU == ctx->device) { - size_t rpc_ion_memsize = 0; - size_t rpc_ion_usage = 0; - if (HWACCEL_CDSP != g_qnn_params.hwaccel_approach) { - rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); - rpc_ion_usage = ctx->instance->get_rpcmem_usage(); - } else { - ggmlhexagon_probe_dspinfo(ctx, &rpc_ion_memsize); - } - GGMLQNN_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize); - GGMLQNN_LOG_DEBUG("rpc usage %d M", rpc_ion_usage); - *total = rpc_ion_memsize * (1 << 20); - *free = (rpc_ion_memsize - rpc_ion_usage) * (1 << 20); + struct ggml_tensor * src0 = op_tensor->src[0]; + struct ggml_tensor * src1 = op_tensor->src[1]; + const int64_t ne00 = op_tensor->src[0]->ne[0]; + uint32_t src0_rank = ggml_n_dims(src0); + uint32_t src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); } -} - -static enum ggml_backend_dev_type ggml_backend_qnn_device_get_type(ggml_backend_dev_t dev) { - struct ggml_backend_qnn_context * ctx = static_cast(dev->context); - if (QNN_BACKEND_CPU == ctx->device) - return GGML_BACKEND_DEVICE_TYPE_ACCEL; - else if (QNN_BACKEND_GPU == ctx->device) - return GGML_BACKEND_DEVICE_TYPE_ACCEL; - else if (QNN_BACKEND_NPU == ctx->device) - return GGML_BACKEND_DEVICE_TYPE_ACCEL; - else - return GGML_BACKEND_DEVICE_TYPE_CPU; -} -static void ggml_backend_qnn_device_get_props(ggml_backend_dev_t dev, - struct ggml_backend_dev_props * props) { - props->name = ggml_backend_qnn_device_get_name(dev); - props->description = ggml_backend_qnn_device_get_description(dev); - props->type = ggml_backend_qnn_device_get_type(dev); - ggml_backend_qnn_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* .async = */ false, - /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ true, - /* .events = */ false, - }; -} + switch (op_tensor->op) { + case GGML_OP_ADD: + case GGML_OP_SUB: + { + if (!ggml_are_same_shape(src0, src1)) { + return false; + } -static ggml_backend_t ggml_backend_qnn_device_init_backend(ggml_backend_dev_t dev, const char * params) { - GGML_UNUSED(dev); - GGMLQNN_LOG_INFO("enter %s\n", __func__); - size_t dev_index = 0; + if (ne00 < 32) + return false; - //case-1: test-backend-ops or other similar scenairo: calling ggml_backend_dev_init(dev, reinterpret_cast(i)) directly in user's code - ggmlqnn_load_cfg(); - GGMLQNN_LOG_INFO("user's specified qnn_backend in cfgfile = %d", g_qnn_params.qnn_backend); - GGMLQNN_LOG_INFO("user's sepcified qnn runtime lib path in cfgfile = %s", g_qnn_params.qnn_runtimelib_path); + return ggmlhexagon_same_types(ctx, op_tensor); + } - if (nullptr == params) { - GGMLQNN_LOG_INFO("program specified param is nullptr\n"); - dev_index = (g_qnn_params.qnn_backend > 0) ? 
g_qnn_params.qnn_backend : 0; - if (dev_index >= GGML_QNN_MAX_DEVICES) { - GGMLQNN_LOG_INFO("assume the default ggml backend\n"); - return nullptr; - } - } else { - GGMLQNN_LOG_INFO("program specified param is not nullptr\n"); - //user's program calling ggml_backend_qnn_device_init_backend directly - dev_index = (int)(intptr_t)params; - g_qnn_params.qnn_backend = dev_index; - GGMLQNN_LOG_INFO("program specified dev_index %d\n", dev_index); - } - GGMLQNN_LOG_INFO("qnn_backend=%d", dev_index); - ggml_backend_t qnn_backend = ggml_backend_qnn_init(dev_index, g_qnn_params.qnn_runtimelib_path); - GGMLQNN_LOG_INFO("leave %s\n", __func__); - - return qnn_backend; + case GGML_OP_DIV: + case GGML_OP_MUL: { + if (ctx->device == HEXAGON_BACKEND_QNNNPU) + return false; -} + if (!ggml_are_same_shape(src0, src1)) { + return false; + } -static ggml_backend_buffer_type_t ggml_backend_qnn_buffer_type(size_t device_index) { - if (device_index >= GGML_QNN_MAX_DEVICES) { - GGMLQNN_LOG_DEBUG("ggml_backend_qnn_buffer_type error: device_index:%d is out of range [0, %d]\n", - device_index, GGML_QNN_MAX_DEVICES - 1); - return nullptr; - } + if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix mul + return false; - static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { - /* .iface = */ { - /* .get_name = */ ggml_backend_qnn_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_qnn_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_qnn_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_qnn_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes - /* .is_host = */ ggml_backend_qnn_buffer_is_host - }, - /* .device = */ nullptr, - /* .context = */ nullptr, - }; + return ggmlhexagon_same_types(ctx, op_tensor); + } + case GGML_OP_MUL_MAT: + { + ggmlhexagon_dump_op_info(op_tensor); + if (src0_rank != src1_rank) // make QNN SDK happy + return false; - return &ggml_backend_buffer_type_qnn; -} + if (src0_rank < 2) // QNN's limitation, make QNN SDK happy + return false; -static ggml_backend_buffer_type_t ggml_backend_qnn_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; - return ggml_backend_qnn_buffer_type(ctx->device); -} + if (4 == src0_rank) //TODO: 4D matrix mulmat in CT + return false; -static ggml_backend_buffer_t ggml_backend_qnn_device_buffer_from_host_ptr(ggml_backend_dev_t dev, - void * ptr, size_t size, size_t max_tensor_size) { - return ggml_backend_cpu_buffer_from_ptr(ptr, size); + if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy + return false; - GGML_UNUSED(dev); - GGML_UNUSED(max_tensor_size); + if (ctx->device == HEXAGON_BACKEND_QNNNPU) { + return (src0->type == GGML_TYPE_F32 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K + ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) + && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } + } + case GGML_OP_LOG: + { + if (ctx->device == HEXAGON_BACKEND_QNNNPU) + return false; + } + case GGML_OP_SQRT: + default: + return ggmlhexagon_same_types(ctx, op_tensor); + } } -static bool ggml_backend_qnn_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - ggml_backend_qnn_context * ctx = (ggml_backend_qnn_context *) dev->context; - return 
(ggmlqnn_can_handle_op(ctx,op)); -} +static bool ggmlhexagon_compute_forward(ggml_backend_t backend, struct ggml_tensor * dst) { + ggmlqnn_op_func_t func = nullptr; + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context; -static bool ggml_backend_qnn_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - GGML_UNUSED(dev); - return ggml_backend_buft_is_host(buft); -} + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + ggmlhexagon_compute(ctx, dst); + return true; + } -static struct ggml_backend_device_i ggml_backend_qnn_device_interface = { - /* .get_name = */ ggml_backend_qnn_device_get_name, - /* .get_description = */ ggml_backend_qnn_device_get_description, - /* .get_memory = */ ggml_backend_qnn_device_get_memory, - /* .get_type = */ ggml_backend_qnn_device_get_type, - /* .get_props = */ ggml_backend_qnn_device_get_props, - /* .init_backend = */ ggml_backend_qnn_device_init_backend, - /* .get_buffer_type = */ ggml_backend_qnn_device_get_buffer_type, - /* .get_host_buffer_type = */ nullptr, - /* .buffer_from_host_ptr = */ ggml_backend_qnn_device_buffer_from_host_ptr, - /* .supports_op = */ ggml_backend_qnn_device_supports_op, - /* .supports_buft = */ ggml_backend_qnn_device_supports_buft, - /* .offload_op = */ nullptr, - /* .event_new = */ nullptr, - /* .event_free = */ nullptr, - /* .event_synchronize = */ nullptr, -}; + switch (dst->op) { + case GGML_OP_REPEAT: + ggmlqnn_compute_repeat(ctx, dst); + break; + case GGML_OP_GET_ROWS: + ggmlqnn_compute_get_rows(ctx, dst); + break; + case GGML_OP_DUP: + ggmlqnn_compute_dup(ctx, dst); + break; + case GGML_OP_ADD: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQRT: + case GGML_OP_LOG: + func = ggmlqnn_compute_elementwise; + break; + case GGML_OP_ACC: + ggmlqnn_compute_acc(ctx, dst); + break; + case GGML_OP_UNARY: + switch (ggml_get_unary_op(dst)) { + case GGML_UNARY_OP_GELU: + break; + case GGML_UNARY_OP_SILU: + break; + case GGML_UNARY_OP_GELU_QUICK: + break; + case GGML_UNARY_OP_TANH: + break; + case GGML_UNARY_OP_RELU: + break; + case GGML_UNARY_OP_HARDSIGMOID: + break; + case GGML_UNARY_OP_HARDSWISH: + break; + default: + return false; + } + break; + case GGML_OP_NORM: + ggmlqnn_compute_norm(ctx, dst); + break; + case GGML_OP_GROUP_NORM: + ggmlqnn_compute_group_norm(ctx, dst); + break; + case GGML_OP_CONCAT: + ggmlqnn_compute_concat(ctx, dst); + break; + case GGML_OP_UPSCALE: + ggmlqnn_compute_upsample_nearest2d(ctx, dst); + break; + case GGML_OP_PAD: + ggmlqnn_compute_pad(ctx, dst); + break; + case GGML_OP_ARANGE: + ggmlqnn_compute_arange(ctx, dst); + break; + case GGML_OP_TIMESTEP_EMBEDDING: + ggmlqnn_compute_timestep_embedding(ctx, dst); + break; + case GGML_OP_LEAKY_RELU: + ggmlqnn_compute_leaky_relu(ctx, dst); + break; + case GGML_OP_RMS_NORM: + ggmlqnn_compute_rms_norm(ctx, dst); + break; + case GGML_OP_MUL_MAT: + ggmlqnn_compute_mul_mat(ctx, dst); + break; + case GGML_OP_MUL_MAT_ID: + return false; + case GGML_OP_SCALE: + ggmlqnn_compute_scale(ctx, dst); + break; + case GGML_OP_SQR: + ggmlqnn_compute_sqr(ctx, dst); + break; + case GGML_OP_CLAMP: + ggmlqnn_compute_clamp(ctx, dst); + break; + case GGML_OP_CPY: + ggmlqnn_compute_cpy(ctx, dst); + break; + case GGML_OP_CONT: + ggmlqnn_compute_dup(ctx, dst); + break; + case GGML_OP_NONE: + case GGML_OP_RESHAPE: + case GGML_OP_VIEW: + case GGML_OP_PERMUTE: + case GGML_OP_TRANSPOSE: + break; + case GGML_OP_DIAG_MASK_INF: + ggmlqnn_compute_diag_mask(ctx, dst, -INFINITY); + break; + case 
GGML_OP_SOFT_MAX: + ggmlqnn_compute_softmax(ctx, dst); + break; + case GGML_OP_ROPE: + ggmlqnn_compute_rope(ctx, dst); + break; + case GGML_OP_IM2COL: + ggmlqnn_compute_im2col(ctx, dst); + break; + case GGML_OP_POOL_2D: + ggmlqnn_compute_pool2d(ctx, dst); + break; + case GGML_OP_SUM_ROWS: + ggmlqnn_compute_sum_rows(ctx, dst); + break; + case GGML_OP_ARGSORT: + ggmlqnn_compute_argsort(ctx, dst); + break; + default: + return false; + } -static ggml_backend_i ggml_backend_qnn_interface = { - /* .get_name = */ ggml_backend_qnn_name, - /* .free = */ ggml_backend_qnn_free, - /* .set_tensor_async = */ nullptr, - /* .get_tensor_async = */ nullptr, - /* .cpy_tensor_async = */ nullptr, - /* .synchronize = */ nullptr, - /* .graph_plan_create = */ nullptr, - /* .graph_plan_free = */ nullptr, - /* .graph_plan_update = */ nullptr, - /* .graph_plan_compute = */ nullptr, - /* .graph_compute = */ nullptr, - /* .event_record = */ nullptr, - /* .event_wait = */ nullptr, -}; + if (nullptr != func) + func(ctx, dst); -//FIXME: this guid is not make sense -static ggml_guid_t ggml_backend_qnn_guid() { - static ggml_guid guid = { - 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, - 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 - }; - return &guid; + return true; } -bool ggml_backend_is_qnn(ggml_backend_t backend) { - return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_qnn_guid()); +static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + delete ctx; } -void ggml_backend_qnn_set_n_threads(ggml_backend_t backend, int n_threads) { - GGML_ASSERT(ggml_backend_is_qnn(backend)); - - struct ggml_backend_qnn_context * ctx = (struct ggml_backend_qnn_context *)backend->context; - ctx->n_threads = n_threads; +static void * ggml_backend_hexagon_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + return ctx->buffer; } -int ggml_backend_qnn_get_device_count() { - return GGML_QNN_MAX_DEVICES; +static enum ggml_status ggml_backend_hexagon_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + GGML_UNUSED(tensor); + GGML_UNUSED(ctx); + return GGML_STATUS_SUCCESS; } -struct ggml_backend_qnn_reg_context { - std::vector devices; -}; +static void ggml_backend_hexagon_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor * tensor, const void * data, + size_t offset, size_t size) { + GGML_UNUSED(buffer); -static const char * ggml_backend_qnn_reg_get_name(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return "ggml-hexagon"; + memcpy((char *)tensor->data + offset, data, size); } -static size_t ggml_backend_qnn_reg_get_device_count(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return GGML_QNN_MAX_DEVICES; +static void ggml_backend_hexagon_buffer_memset_tensor(ggml_backend_buffer_t buffer, + struct ggml_tensor * tensor, + uint8_t value, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memset((char *)tensor->data + offset, value, size); } -static ggml_backend_dev_t ggml_backend_qnn_reg_get_device(ggml_backend_reg_t reg, size_t index) { - GGML_UNUSED(reg); - GGML_UNUSED(index); - - GGMLQNN_LOG_DEBUG("index %d", index); - ggml_backend_qnn_reg_context * ctx = (ggml_backend_qnn_reg_context *)reg->context; - GGML_ASSERT(index < ctx->devices.size()); - return 
ctx->devices[index]; +static void ggml_backend_hexagon_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor * tensor, + void * data, size_t offset, size_t size) { + GGML_UNUSED(buffer); + memcpy(data, (const char *)tensor->data + offset, size); } -static void * ggml_backend_qnn_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { - GGML_UNUSED(reg); - - if (nullptr == name) - return nullptr; - - const char * slot_name = "ggml_backend_set_n_threads"; - if (0 == memcmp(name, slot_name, strlen(slot_name))) { - return (void *)ggml_backend_qnn_set_n_threads; +static bool ggml_backend_hexagon_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const struct ggml_tensor * src, + struct ggml_tensor * dst) { + GGML_UNUSED(buffer); + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; } - return nullptr; -} -static const ggml_backend_reg_i ggml_backend_qnn_reg_interface = { - /* .get_name = */ ggml_backend_qnn_reg_get_name, - /* .get_device_count = */ ggml_backend_qnn_reg_get_device_count, - /* .get_device = */ ggml_backend_qnn_reg_get_device, - /* .get_proc_address = */ ggml_backend_qnn_reg_get_proc_address, -}; - -ggml_backend_reg_t ggml_backend_qnn_reg() { - static ggml_backend_reg reg; - static bool initialized = false; - GGMLQNN_LOG_DEBUG("enter ggml_backend_qnn_reg"); - - //case-2: normal scenario, such as llama-cli or UI applicaton - ggmlqnn_load_cfg(); - GGMLQNN_LOG_INFO("inference approach=%d(%s)", g_qnn_params.hwaccel_approach, - ggmlqnn_get_hwaccel_approach_name(g_qnn_params.hwaccel_approach)); - GGMLQNN_LOG_INFO("user's specified qnn_backend=%d", g_qnn_params.qnn_backend); - GGMLQNN_LOG_INFO("user's specified qnn runtime lib path=%s", g_qnn_params.qnn_runtimelib_path); - if (g_qnn_params.qnn_backend >= GGML_QNN_MAX_DEVICES) { - GGMLQNN_LOG_INFO("assume default ggml backend\n"); - GGMLQNN_LOG_DEBUG("leave ggml_backend_qnn_reg"); - return nullptr; - } - - { - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { - ggml_backend_qnn_reg_context * ctx = new ggml_backend_qnn_reg_context; - - for (int i = 0; i < ggml_backend_qnn_get_device_count(); i++) { - ggml_backend_dev_t dev = new ggml_backend_device { - /* .iface = */ ggml_backend_qnn_device_interface, - /* .reg = */ ®, - /* .context = */ &g_qnn_mgr[i] - }; - ctx->devices.push_back(dev); - } - - reg = ggml_backend_reg { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_qnn_reg_interface, - /* .context = */ ctx - }; - } - - initialized = true; - } - GGMLQNN_LOG_DEBUG("leave ggml_backend_qnn_reg"); - - return ® -} - -const char * ggml_backend_qnn_get_devname(size_t dev_num) { - if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { - if (dev_num == QNN_BACKEND_GGML) - return "ggml"; - else - return "ggml-hexagon"; - } - - switch (dev_num) { - case QNN_BACKEND_CPU: - return "QNN-CPU"; - case QNN_BACKEND_GPU: - return "QNN-GPU"; - case QNN_BACKEND_NPU: - return "QNN-NPU"; - case QNN_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML - default: - return "unknown"; - } -} - -static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) { - int result = 0; - GGMLQNN_LOG_INFO("hwaccel approach=%d(%s)", g_qnn_params.hwaccel_approach, - ggmlqnn_get_hwaccel_approach_name(g_qnn_params.hwaccel_approach)); - - qnn_instance * instance = nullptr; - instance = new qnn_instance(qnn_lib_path, g_qnn_mgr[device].lib, ""); - 
result = instance->qnn_init(nullptr); - if (0 != result) { - GGMLQNN_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", - ggml_backend_qnn_get_devname(device)); - delete instance; - return nullptr; - } - qnn_interface qnn_interface = instance->get_qnn_interface(); - if (!qnn_interface.is_loaded()) { - GGMLQNN_LOG_WARN("qnn subsystem failure\n"); - delete instance; - return nullptr; - } - - std::string device_name = ggml_backend_qnn_get_devname(device); - GGMLQNN_LOG_INFO("qnn device name %s", device_name.c_str()); - g_qnn_mgr[device].instance = instance; - g_qnn_mgr[device].raw_interface = instance->get_qnn_raw_interface(); - g_qnn_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); - - return instance; -} - -/** - * - * @param device 0: QNN_BACKEND_CPU 1: QNN_BACKEND_GPU 2: QNN_BACKEND_NPU - * @param qnn_lib_path QNN binrary runtime library path, such as "/data/local/tmp/" on Android or specified in JNI layer - * @return - */ -ggml_backend_t ggml_backend_qnn_init(size_t device, const char * qnn_lib_path) { - int result = 0; - - GGMLQNN_LOG_INFO("enter %s\n", __func__); - //case-3: calling ggml_backend_qnn_init() directly in user's code - ggmlqnn_load_cfg(); - - if (nullptr == qnn_lib_path) - return nullptr; - - GGMLQNN_LOG_DEBUG("device %d", device); - GGMLQNN_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); - if (device >= GGML_QNN_MAX_DEVICES) { - GGMLQNN_LOG_ERROR("invalid device %d", device); - return nullptr; - } - -#if defined(__ANDROID__) - std::string path = qnn_lib_path; - GGMLQNN_LOG_INFO("lib_path %s", path.c_str()); - ggmlqnn_set_runtime_path(device, path); -#endif - - if (nullptr != g_qnn_mgr[device].backend) { - GGMLQNN_LOG_INFO("backend %d(%s) already loaded", device, - ggml_backend_qnn_get_devname(device)); - GGMLQNN_LOG_INFO("leave %s\n", __func__); - return g_qnn_mgr[device].backend; - } - - //don't initialize QNN when hwaccel approach is offload ggml op to Hexagon cDSP directly - if (HWACCEL_CDSP != g_qnn_params.hwaccel_approach) { - qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path); - if (nullptr == instance) - return nullptr; - } - - ggml_backend_qnn_interface.graph_compute = ggmlqnn_backend_graph_compute_general; - - ggml_backend_t qnn_backend = new ggml_backend{ - /* .guid = */ ggml_backend_qnn_guid(), - /* .iface = */ ggml_backend_qnn_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_qnn_reg(), device), - /* .context = */ &g_qnn_mgr[device] - }; - - g_qnn_mgr[device].backend = qnn_backend; - if (HWACCEL_CDSP == g_qnn_params.hwaccel_approach) { - int result = ggmlhexagon_init_dsp(&g_qnn_mgr[device]); - if (0 != result) { - GGMLQNN_LOG_INFO("init hexagon dsp failure"); - ggml_backend_qnn_free(qnn_backend); - return nullptr; - } - //ensure test-backend-ops get the correct backend name when inference approach is 1(HWACCEL_CDSP) - memcpy(g_qnn_mgr[device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); - } - - GGMLQNN_LOG_INFO("leave %s\n", __func__); - - return qnn_backend; -} - -GGML_BACKEND_DL_IMPL(ggml_backend_qnn_reg) - -// ================================================================================================= -// section-9: general approach: offload GGML op to QNN backend or offload GGML op to Hexagon cDSP directly -// ================================================================================================= -static inline uint32_t ggmlqnn_get_tensor_data_size(const ggml_tensor * tensor) { - /* - size_t data_size = ggml_row_size(tensor->type, tensor->ne[0]); - 
size_t n_dims = ggml_get_tensor_rank(tensor); - for (int i = 1; i < n_dims; i++) { - data_size *= tensor->ne[i]; - } - - return data_size; - */ - return ggml_nbytes(tensor); -} - -static inline bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0, - const ggml_tensor * src1, ggml_tensor * dst) { - if ((nullptr == ctx) || (nullptr == src0) || (nullptr == dst)) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - qnn_instance * instance = ctx->instance; - if (nullptr == instance) { - GGMLQNN_LOG_WARN("invalid params\n"); - return false; - } - - return true; + return false; } -/* - * provide a general skeleton to offload ggml op to QNN backend: perform element-wise - * operation on 1/2 input tensors and 1 output tensors -*/ -static void ggmlqnn_compute_elementwise(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - qnn_instance * instance = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - size_t qnn_op_index = ggmlqnn_get_op_index(op); - const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; - size_t input_param_count = ggmlqnn_k_op_caps[qnn_op_index].input_param_count; - std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); - const char * ggml_op_name = ggml_op_name_string.c_str(); - - std::string graph_name; - ggmlqnn_get_opkey_from_op(op, graph_name); - - qnn_perf op_perf(graph_name); - op_perf.start(); - - bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == QNN_BACKEND_NPU; - if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { - //retrieve computational resource from cached QNN graph - qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_ptensors_t & ptensors = std::get<1>(graph_item); - p_tensor0 = ptensors[0]; - if (2 == input_param_count) { - p_tensor1 = ptensors[1]; - p_tensor2 = ptensors[2]; - } else { - //now p_tensor1 is nullptr - p_tensor2 = ptensors[1]; - } - } else { - GGML_ASSERT(instance->get_device_id() == ctx->device); - GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); - //create QNN graph - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), - g_qnn_params.vtcm_size_in_mb, - g_qnn_params.hvx_threads); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", graph_name.c_str(), error); - return; - } - graph_handle = instance->get_qnn_graph_handle(); - - GGMLQNN_LOG_DEBUG("graph_handle %p", graph_handle); - //create computational tensor - p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE); - if (2 == input_param_count) { - p_tensor1 = ggmlqnn_create_compute_tensor(instance, graph_handle, src1, QNN_TENSOR_TYPE_APP_WRITE); - } - p_tensor2 = ggmlqnn_create_compute_tensor(instance, graph_handle, dst, QNN_TENSOR_TYPE_APP_READ); - - //compose QNN graph - qnn_tensors_t input_tensors; - input_tensors.reserve(input_param_count); - input_tensors.push_back(*p_tensor0); - if (2 == input_param_count) { - input_tensors.push_back(*p_tensor1); - 
} - Qnn_Tensor_t output_tensors[] = { - *p_tensor2 - }; - Qnn_OpConfig_t op_config = ggmlqnn_create_op_config(ggml_op_name, - QNN_OP_PACKAGE_NAME_QTI_AISW, - qnn_op_name, nullptr, 0, - input_tensors.data(), - input_param_count, output_tensors, - 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, op_config)); - //finalize QNN graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); - - //cache QNN graph - qnn_ptensors_t qnn_elementwise_tensors; - qnn_elementwise_tensors.reserve(input_param_count + 1); - - qnn_elementwise_tensors.push_back(p_tensor0); - if (2 == input_param_count) { - qnn_elementwise_tensors.push_back(p_tensor1); - } - qnn_elementwise_tensors.push_back(p_tensor2); - auto graph_item = std::make_tuple(graph_handle, qnn_elementwise_tensors); - ctx->qnn_singlenode_graph_map[graph_name] = graph_item; - } - - if (enable_npu_rpc) { - uint8_t * qnn_buffer_0 = static_cast(instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*p_tensor0)->memHandle)); - GGMLQNN_LOG_DEBUG("qnn_rpcbuffer_0 = %p\n", qnn_buffer_0); - if (nullptr != qnn_buffer_0) { - memcpy(qnn_buffer_0, src0->data, ggml_nbytes(src0)); - } - - if (2 == input_param_count) { - uint8_t * qnn_buffer_1 = static_cast(instance->get_rpcmem_from_memhandle( - QNN_VER_PTR(*p_tensor1)->memHandle)); - GGMLQNN_LOG_DEBUG("qnn_rpcbuffer_1 = %p\n", qnn_buffer_1); - if (nullptr != qnn_buffer_1) { - memcpy(qnn_buffer_1, src1->data, ggml_nbytes(src1)); - } - } - } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; - if (2 == input_param_count) { - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - } - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; - } - - qnn_tensors_t input_tensors; - input_tensors.reserve(input_param_count); - input_tensors.push_back(*p_tensor0); - if (2 == input_param_count) { - input_tensors.push_back(*p_tensor1); - } - Qnn_Tensor_t output_tensors[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - input_tensors.data(), input_param_count, - output_tensors, 1, - nullptr, nullptr)); - if (enable_npu_rpc) { - uint8_t * qnn_buffer_2 = static_cast(instance->get_rpcmem_from_memhandle(QNN_VER_PTR(*p_tensor2)->memHandle)); - if (nullptr != qnn_buffer_2) { - memcpy(dst->data, qnn_buffer_2, ggml_nbytes(dst)); - } - } - - op_perf.info(); +static void ggml_backend_hexagon_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; + memset(ctx->buffer, value, ctx->buffer_size); } -/* - * this function is AI-assisted code from Grok 3 for purpose of offload 4d matrix mulmat to QNN backend - * various UT has verified and succeed but failed in CT of test-backend-ops - * - * the logic of ggmlqnn_compute_mul_mat_4d is similar to ggmlqnn_compute_mul_mat but much more complicated - * than ggmlqnn_compute_mul_mat, so it's a standalone function. 
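 *
 * (illustration only, not code from this patch: the cached single-node graph
 * pattern shared by ggmlqnn_compute_elementwise above and the mulmat paths below,
 * assuming the helpers used elsewhere in this file, boils down to)
 *
 *     if (ctx->qnn_singlenode_graph_map.find(graph_name) == ctx->qnn_singlenode_graph_map.end()) {
 *         instance->init_qnn_graph(graph_name, device, vtcm_size_in_mb, hvx_threads);
 *         // create the input/output tensors, graphAddNode(), graphFinalize(), then cache them
 *     }
 *     // bind ggml tensor data through clientBuf (or RPC buffers on the NPU), then graphExecute()
 *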
- * it will be combined with ggmlqnn_compute_mul_mat in the future - */ -static void ggmlqnn_compute_mul_mat_4d(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; - qnn_perf op_perf = qnn_perf("ggmlqnn_compute_mul_mat_4d"); - qnn_instance * instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; - - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; - - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - GGML_ASSERT(ggml_n_dims(src0) == 4 && ggml_n_dims(src1) == 4); - op_perf.start(); - - std::string graph_name; - ggmlqnn_get_opkey_from_op(op, graph_name); - GGMLQNN_LOG_DEBUG("graph name %s\n", graph_name.c_str()); - - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); - - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_reshape0_out = nullptr; - Qnn_Tensor_t * p_tile0_out = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_permute1_out = nullptr; - Qnn_Tensor_t * p_reshape1_out = nullptr; - Qnn_Tensor_t * p_matmul_out = nullptr; - Qnn_Tensor_t * p_reshape2_out = nullptr; - - if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { - graph_initialized = true; - qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_ptensors_t & tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_reshape0_out = tensors[1]; - p_tile0_out = tensors[2]; - p_tensor1 = tensors[3]; - p_permute1_out = tensors[4]; - p_reshape1_out = tensors[5]; - p_matmul_out = tensors[6]; - p_reshape2_out = tensors[7]; - } else { - CHECK_QNN_API(error, qnn_raw_interface.graphCreate(instance->get_qnn_context_handle(), graph_name.c_str(), NULL, &graph_handle)); - - // Define dimensions - uint32_t K = src0->ne[0]; // Inner dimension - uint32_t M = src0->ne[1]; // Rows of src0 - uint32_t N = src1->ne[1]; // Columns of src1 - uint32_t B0 = src0->ne[2] * src0->ne[3]; // src0 batch - uint32_t B1 = src1->ne[2] * src1->ne[3]; // src1 batch (drives output) - - // Validate K only - GGML_ASSERT(src0->ne[0] == src1->ne[0]); // K must match - - // src0: [K, M, H0, B0] -> QNN: [B0, H0, M, K] - uint32_t src0_dims[] = {static_cast(src0->ne[3]), static_cast(src0->ne[2]), - static_cast(src0->ne[1]), static_cast(src0->ne[0]) - }; - p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, "input0", - QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src0_dims, nullptr, 0); - - // Reshape src0 to [B0, M, K] - uint32_t reshape0_out_dims[] = {B0, M, K}; - p_reshape0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape0_out", - QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - reshape0_out_dims, nullptr, 0); - - Qnn_Tensor_t reshape0_inputs[] = {*p_tensor0}; - Qnn_Tensor_t reshape0_outputs[] = {*p_reshape0_out}; - Qnn_OpConfig_t reshape0_op = ggmlqnn_create_op_config("reshape0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape0_inputs, 1, reshape0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape0_op)); - - // Tile src0 to match B1: [B0, M, K] -> [B1, M, K] - uint32_t tile0_out_dims[] = {B1, M, K}; - p_tile0_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile0_out", - QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - tile0_out_dims, nullptr, 0); - - uint32_t tile_multiples[] = 
{B1 / B0, 1, 1}; - uint32_t tile_dims[] = {3}; - Qnn_Tensor_t * p_tile_multiples = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "tile_multiples", - QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - tile_dims, tile_multiples, sizeof(tile_multiples)); - - Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; - Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; - Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; - Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TILE, tile_params, 1, - tile0_inputs, 1, tile0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, tile0_op)); - - // src1: [N, K, H1, B1] -> QNN: [B1, H1, N, K] - uint32_t src1_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), - static_cast(src1->ne[1]), static_cast(src1->ne[0]) - }; - p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, "input1", - QNN_TENSOR_TYPE_APP_WRITE, QNN_DATATYPE_FLOAT_32, 4, - src1_dims, nullptr, 0); - - - // Permute src1 to [B1, H1, K, N] - uint32_t perm_data[] = {0, 1, 3, 2}; - uint32_t perm_dims[] = {4}; - Qnn_Tensor_t * p_perm = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "perm", - QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, - perm_dims, perm_data, sizeof(perm_data)); - - uint32_t permute1_out_dims[] = {static_cast(src1->ne[3]), static_cast(src1->ne[2]), - static_cast(src1->ne[0]), static_cast(src1->ne[1]) - }; - p_permute1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "permute1_out", - QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, - permute1_out_dims, nullptr, 0); - - Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; - Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; - Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; - Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, permute1_params, 1, - permute1_inputs, 1, permute1_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, permute1_op)); - - // Reshape src1 to [B1, K, N] - uint32_t reshape1_out_dims[] = {B1, K, N}; - p_reshape1_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "reshape1_out", - QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - reshape1_out_dims, nullptr, 0); - - Qnn_Tensor_t reshape1_inputs[] = {*p_permute1_out}; - Qnn_Tensor_t reshape1_outputs[] = {*p_reshape1_out}; - Qnn_OpConfig_t reshape1_op = ggmlqnn_create_op_config("reshape1", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape1_inputs, 1, reshape1_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape1_op)); - - // MatMul: [B1, M, K] x [B1, K, N] -> [B1, M, N] - uint32_t matmul_out_dims[] = {B1, M, N}; - p_matmul_out = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "matmul_out", - QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 3, - matmul_out_dims, nullptr, 0); - - Qnn_Tensor_t matmul_inputs[] = {*p_tile0_out, *p_reshape1_out}; - Qnn_Tensor_t matmul_outputs[] = {*p_matmul_out}; - Qnn_OpConfig_t matmul_op = ggmlqnn_create_op_config("matmul", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, nullptr, 0, - matmul_inputs, 2, matmul_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, matmul_op)); - - // Output: [N, M, H1, B1] -> QNN: [B1, H1, M, N] - uint32_t reshape2_out_dims[] = {static_cast(dst->ne[3]), 
static_cast(dst->ne[2]), - static_cast(dst->ne[1]), static_cast(dst->ne[0]) - }; - p_reshape2_out = ggmlqnn_create_general_tensor(instance, graph_handle, dst, "output", - QNN_TENSOR_TYPE_APP_READ, QNN_DATATYPE_FLOAT_32, 4, - reshape2_out_dims, nullptr, 0); - - Qnn_Tensor_t reshape2_inputs[] = {*p_matmul_out}; - Qnn_Tensor_t reshape2_outputs[] = {*p_reshape2_out}; - Qnn_OpConfig_t reshape2_op = ggmlqnn_create_op_config("reshape2", QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_RESHAPE, nullptr, 0, - reshape2_inputs, 1, reshape2_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, reshape2_op)); +static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { + /* .free_buffer = */ ggml_backend_hexagon_buffer_free_buffer, + /* .get_base = */ ggml_backend_hexagon_buffer_get_base, + /* .init_tensor = */ ggml_backend_hexagon_buffer_init_tensor, + /* .memset_tensor = */ ggml_backend_hexagon_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_hexagon_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_hexagon_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_hexagon_buffer_cpy_tensor, + /* .clear = */ ggml_backend_hexagon_buffer_clear, + /* .reset = */ nullptr, +}; - // Finalize - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL)); +static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return "qnn-buffer"; +} - // Cache - qnn_ptensors_t ggml_op_mulmat_tensors = {p_tensor0, p_reshape0_out, p_tile0_out, p_tensor1, - p_permute1_out, p_reshape1_out, p_matmul_out, p_reshape2_out - }; - ctx->qnn_singlenode_graph_map[graph_name] = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); +static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( + ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context; + + size_t size_page = 0; +#if defined(__ANDROID__) || defined(__linux__) + size_page = sysconf(_SC_PAGESIZE); +#else + SYSTEM_INFO systeminfo; + GetSystemInfo(&systeminfo); + size_page = systeminfo.dwPageSize; +#endif + size_t size_aligned = size; + if ((size_aligned % size_page) != 0) { + size_aligned += (size_page - (size_aligned % size_page)); + } + ctx->buffer = ggml_aligned_malloc(size_aligned); + ctx->buffer_size = size_aligned; + if (nullptr == ctx->buffer) { + GGMLHEXAGON_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20)); + return nullptr; } - // Execute - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, static_cast(ggml_nbytes(src0))}; - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, static_cast(ggml_nbytes(src1))}; - QNN_VER_PTR(*p_reshape2_out)->clientBuf = {dst->data, static_cast(ggml_nbytes(dst))}; + return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, ctx, size); +} - Qnn_Tensor_t input_tensors[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t output_tensors[] = {*p_reshape2_out}; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, input_tensors, 2, output_tensors, 1, NULL, NULL)); +static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return 32; +} - op_perf.info(); +static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + + return (2 * (1 << 29)); } -/* - * @brief performs matrix multiplication with FP32 & quantized weights and floating-point inputs - * using the QNN backend. 
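 * (illustration only, not code from this patch: as the notes below explain, ggml stores
 * ne[] fastest-dimension-first while the QNN tensor dimensions built in this file use the
 * reverse order, i.e.
 *
 *     for (uint32_t i = 0; i < rank; i++) {
 *         qnn_dims[i] = (uint32_t)tensor->ne[rank - 1 - i];
 *     }
 *
 * so the 3x2 example below with ne[0] = 2, ne[1] = 3 becomes QNN dims {3, 2}.)
 *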
this function performs matrix multiplication of the input tensor - * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, - * and stores the result in the destination tensor `dst`. - * - there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn - 1. transpose - a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: - struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); - which like this: - +---+---+ - | 0 | 1 | - +---+---+ - | 2 | 3 | - +---+---+ - | 4 | 5 | - +---+---+ - with - ne[0] = 2 - ne[1] = 3 - there are different dimension order between ggml tensor and qnn tensor +static bool ggml_backend_hexagon_buffer_is_host(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return true; +} - 2. QNN's MatMul can only support input tensors with rank >= 2 +static const char * ggml_backend_hexagon_name(ggml_backend_t backend) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *) backend->context; + return g_hexagon_mgr[ctx->device].name; +} - in the all, there is gap between ggml mulmat and QNN mulmat,we need to perform a transpose - operation when offloading mulmat to QNN backend. this implementation will handle transpose - in func ggmlqnn_compute_create_general_tensor() +static void ggml_backend_hexagon_free(ggml_backend_t backend) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context; - * @param ctx the context of backend - * @param op the destination tensor where the result of the matrix multiplication will be stored. - * - * @note the logic of ggmlqnn_compute_mul_mat is similar to ggmlqnn_compute_op_two_tensors but much more complicated - * than ggmlqnn_compute_op_two_tensors. so it's a standalone function. accordingly, this is another - * typical skeleton for offload other ggml ops to QNN backend. MUL_MAT take most of the compute - * time (about 95%).so to speed up llama inference, should focus on this func. there are three kinds - * of MUL_MAT to compute: - * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend - * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1 - * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, Q6_K...) 
- * and src1 is F32, src0 -> f32 in src0', then src0' * src1 -*/ -static void ggmlqnn_compute_mul_mat(ggml_backend_qnn_context * ctx, ggml_tensor * op) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; - qnn_instance * instance = nullptr; - Qnn_GraphHandle_t graph_handle = nullptr; - Qnn_Tensor_t * p_tensor0 = nullptr; - Qnn_Tensor_t * p_tensor1 = nullptr; - Qnn_Tensor_t * p_tensor2 = nullptr; - Qnn_Tensor_t * p_param_tensor = nullptr; - Qnn_Tensor_t * p_tensor2_transpose = nullptr; - const ggml_tensor * src0 = op->src[0]; - const ggml_tensor * src1 = op->src[1]; - ggml_tensor * dst = op; + qnn_instance * instance = (qnn_instance*)g_hexagon_mgr[ctx->device].instance; + if (instance != nullptr) { + std::map::iterator singlenode_graph_it; + for (singlenode_graph_it = ctx->qnn_singlenode_graph_map.begin(); + singlenode_graph_it != ctx->qnn_singlenode_graph_map.end(); singlenode_graph_it++) { + auto & graph_res = singlenode_graph_it->second; + Qnn_GraphHandle_t & graph_handle = std::get<0>(graph_res); + qnn_ptensors_t & ptensors = std::get<1>(graph_res); + for (auto tensor_it = ptensors.begin(); tensor_it != ptensors.end(); ++tensor_it) { + ggmlqnn_free_qnntensor(*tensor_it); + } + GGML_UNUSED(graph_handle); + GGMLHEXAGON_LOG_DEBUG("clean up graph:%s", singlenode_graph_it->first.c_str()); + } + ctx->qnn_singlenode_graph_map.clear(); - GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); - instance = ctx->instance; - QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; + instance->qnn_finalize(); + delete instance; + g_hexagon_mgr[ctx->device].instance = nullptr; + } - const enum ggml_type src0_type = src0->type; - const uint32_t src0_rank = ggml_n_dims(src0); - const uint32_t src1_rank = ggml_n_dims(src1); + if (g_hexagon_mgr[ctx->device].backend != nullptr) { + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + ggmlhexagon_close_cdsp(ctx); + } - ggmlqnn_print_tensors_info(__func__, ctx, src0, src1, dst); + delete backend; + g_hexagon_mgr[ctx->device].backend = nullptr; + ggmlhexagon_print_running_timestamp(ctx); + } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); +} - std::string graph_name; - ggmlqnn_get_opkey_from_op(op, graph_name); +static enum ggml_status ggmlhexagon_backend_graph_compute_general(ggml_backend_t backend, struct ggml_cgraph * cgraph) { + enum ggml_status result = GGML_STATUS_SUCCESS; + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context; + GGML_UNUSED(ctx); - qnn_perf op_perf(graph_name); - op_perf.start(); + for (int i = 0; i < cgraph->n_nodes; i++) { + ggml_tensor * node = cgraph->nodes[i]; + if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE + || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW + || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { + continue; + } + bool ok = ggmlhexagon_compute_forward(backend, node); + if (!ok) { + GGMLHEXAGON_LOG_DEBUG("%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); + } + } - GGML_ASSERT(src0_rank == src1_rank); - GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation, make QNN SDK happy - if (4 == src0_rank) { - return ggmlqnn_compute_mul_mat_4d(ctx, op); + return result; +} + +static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) { + struct ggml_backend_hexagon_context *ctx = static_cast(dev->context); + if (nullptr == ctx) { + GGMLHEXAGON_LOG_ERROR("pls check why ctx is null"); + return "unknown"; } + return ctx->name; +} - void * wdata = ggmlqnn_type_trait(ctx, op); - const size_t desired_size = 
ctx->desired_size; +static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev_t dev) { + struct ggml_backend_hexagon_context * ctx = static_cast(dev->context); + static char hexagon_device_desc[GGMLHEXAGON_TMPBUF_LEN]; + if (nullptr == ctx) { + GGMLHEXAGON_LOG_ERROR("pls check why ctx is null"); + return "unknown"; + } - if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { - //retrieve computational resource from cached QNN graph - qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; - graph_handle = std::get<0>(graph_item); - qnn_ptensors_t &tensors = std::get<1>(graph_item); - p_tensor0 = tensors[0]; - p_tensor1 = tensors[1]; - p_tensor2 = tensors[2]; - p_param_tensor = tensors[3]; - p_tensor2_transpose = tensors[4]; + if (0 == strncmp(ctx->name, "qnn-npu", 7)) { + const char * soc_info = ggmlhexagon_get_socmodel_desc(ctx->socinfo.soc_model); + const char * htp_arch = ggmlhexagon_get_htparch_desc(ctx->socinfo.htp_arch); + std::string dev_desc = std::string(ctx->desc) + + std::string(soc_info) + "_" + std::string(htp_arch) + + "," + std::string(ctx->socinfo.soc_desc); + memset(hexagon_device_desc, 0, GGMLHEXAGON_TMPBUF_LEN); + memcpy(hexagon_device_desc, dev_desc.c_str(), strlen(dev_desc.c_str())); + return hexagon_device_desc; } else { - //create QNN graph - GGMLQNN_LOG_INFO("graph name %s", graph_name.c_str()); - error = instance->init_qnn_graph(graph_name, static_cast(ctx->device), - g_qnn_params.vtcm_size_in_mb, - g_qnn_params.hvx_threads); - if (QNN_SUCCESS != error) { - GGMLQNN_LOG_WARN("can't create qnn graph handle with graph name %s, error = %d\n", - graph_name.c_str(), error); - return; - } - graph_handle = instance->get_qnn_graph_handle(); - - //create computational tensor - p_tensor0 = ggmlqnn_create_general_tensor(instance, graph_handle, src0, nullptr, - QNN_TENSOR_TYPE_APP_WRITE, - QNN_DATATYPE_FLOAT_32, src0_rank, - nullptr, nullptr, 0); - p_tensor1 = ggmlqnn_create_general_tensor(instance, graph_handle, src1, nullptr, - QNN_TENSOR_TYPE_APP_WRITE, - QNN_DATATYPE_FLOAT_32, src0_rank, - nullptr, nullptr, 0); - p_tensor2 = ggmlqnn_create_general_tensor(instance, graph_handle, dst, nullptr, - QNN_TENSOR_TYPE_APP_READ, - QNN_DATATYPE_FLOAT_32, src0_rank, - nullptr, nullptr, 0); + return ctx->desc; + } +} - //create param tensor for offload 2d/3d/4d matrix multiplication - const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = { - {0}, - {1, 0}, - {0, 2, 1}, - {0, 1, 3, 2}, - }; - uint32_t param_tensor_dims[1] = {src0_rank}; - p_param_tensor = ggmlqnn_create_general_tensor(instance, graph_handle, nullptr, "param", - QNN_TENSOR_TYPE_STATIC, - QNN_DATATYPE_UINT_32, 1, - param_tensor_dims, - (void *) (param_tensor_data[src0_rank - 1]), - src0_rank * sizeof(uint32_t)); +static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { + struct ggml_backend_hexagon_context * ctx = static_cast(dev->context); + if ((nullptr == ctx) || (ctx->device > HEXAGON_BACKEND_GGML)) { + GGMLHEXAGON_LOG_ERROR("pls check params"); + *free = 0; + *total = 0; + } - //create transpose tensor - p_tensor2_transpose = ggmlqnn_create_general_tensor(instance, graph_handle, dst, - "transpose", - QNN_TENSOR_TYPE_NATIVE, - QNN_DATATYPE_FLOAT_32, src0_rank, - nullptr, nullptr, 0, true); + if (HEXAGON_BACKEND_QNNCPU == ctx->device || HEXAGON_BACKEND_GGML == ctx->device) { + *total = ggmlhexagon_get_system_total_memory_in_bytes(); + *free = 
ggmlhexagon_get_system_free_memory_in_bytes(); + } else if (HEXAGON_BACKEND_QNNGPU == ctx->device) { + //TODO: probe GPU info in Qualcomm Adreno GPU + *total = ggmlhexagon_get_system_total_memory_in_bytes(); + *free = ggmlhexagon_get_system_free_memory_in_bytes(); + } else if (HEXAGON_BACKEND_QNNNPU == ctx->device) { + size_t rpc_ion_memsize = 0; + size_t rpc_ion_usage = 0; + if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) { + rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); + rpc_ion_usage = ctx->instance->get_rpcmem_usage(); + } else { + ggmlhexagon_probe_dspinfo(ctx, &rpc_ion_memsize); + } + GGMLHEXAGON_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize); + GGMLHEXAGON_LOG_DEBUG("rpc usage %d M", rpc_ion_usage); + *total = rpc_ion_memsize * (1 << 20); + *free = (rpc_ion_memsize - rpc_ion_usage) * (1 << 20); + } +} - //compose QNN graph: add mulmat node - Qnn_Param_t out_0_params[] = { - {QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = { - QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; - Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; - Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; - Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_MAT_MUL, out_0_params, 1, - out_0_inputs, 2, out_0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_0)); +static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) { + struct ggml_backend_hexagon_context * ctx = static_cast(dev->context); + if (HEXAGON_BACKEND_QNNCPU == ctx->device) + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + else if (HEXAGON_BACKEND_QNNGPU == ctx->device) + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + else if (HEXAGON_BACKEND_QNNNPU == ctx->device) + return GGML_BACKEND_DEVICE_TYPE_ACCEL; + else + return GGML_BACKEND_DEVICE_TYPE_CPU; +} - //compose QNN graph: add transpose node - Qnn_Param_t out_trans1_0_params[] = { - {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; - Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; - Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; - Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", - QNN_OP_PACKAGE_NAME_QTI_AISW, - QNN_OP_TRANSPOSE, - out_trans1_0_params, 1, - out_trans1_0_inputs, 1, - out_trans1_0_outputs, 1); - CHECK_QNN_API(error, qnn_raw_interface.graphAddNode(graph_handle, out_trans1_0)); +static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev, + struct ggml_backend_dev_props * props) { + props->name = ggml_backend_hexagon_device_get_name(dev); + props->description = ggml_backend_hexagon_device_get_description(dev); + props->type = ggml_backend_hexagon_device_get_type(dev); + ggml_backend_hexagon_device_get_memory(dev, &props->memory_free, &props->memory_total); + props->caps = { + /* .async = */ false, + /* .host_buffer = */ false, + /* .buffer_from_host_ptr = */ true, + /* .events = */ false, + }; +} - //finalize QNN graph - CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, nullptr, nullptr)); +static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_t dev, const char * params) { + GGML_UNUSED(dev); + GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); + size_t dev_index = 0; - //cache QNN graph - qnn_ptensors_t ggml_op_mulmat_tensors; - ggml_op_mulmat_tensors.reserve(5); - ggml_op_mulmat_tensors.push_back(p_tensor0); - ggml_op_mulmat_tensors.push_back(p_tensor1); - ggml_op_mulmat_tensors.push_back(p_tensor2); - 
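
For reference, a minimal usage sketch (not part of this patch, using only entry points defined elsewhere in this file): an application can either go through ggml's device registry, passing the device index as the params pointer (the case-1 path noted in the code above), or call the initializer directly with a device index and the runtime library path, e.g.

    // hypothetical example: offload to the Hexagon NPU / cDSP path with the
    // runtime libraries under /data/local/tmp/ on an Android device
    ggml_backend_t backend = ggml_backend_hexagon_init(HEXAGON_BACKEND_QNNNPU, "/data/local/tmp/");
    if (nullptr == backend) {
        // fall back to the default ggml (CPU) backend
    }
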
ggml_op_mulmat_tensors.push_back(p_param_tensor); - ggml_op_mulmat_tensors.push_back(p_tensor2_transpose); - auto graph_item = std::make_tuple(graph_handle, ggml_op_mulmat_tensors); - ctx->qnn_singlenode_graph_map[graph_name] = graph_item; - } + //case-1: test-backend-ops or other similar scenairo: calling ggml_backend_dev_init(dev, reinterpret_cast(i)) directly in user's code + ggmlhexagon_load_cfg(); + GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend in cfgfile = %d", g_hexagon_appcfg.hexagon_backend); + GGMLHEXAGON_LOG_DEBUG("user's sepcified qnn runtime lib path in cfgfile = %s", g_hexagon_appcfg.runtimelib_path); - if (src0_type != GGML_TYPE_F32) { - QNN_VER_PTR(*p_tensor0)->clientBuf = {wdata, static_cast(desired_size)}; + if (nullptr == params) { + GGMLHEXAGON_LOG_DEBUG("program specified param is nullptr"); + dev_index = (g_hexagon_appcfg.hexagon_backend > 0) ? g_hexagon_appcfg.hexagon_backend : 0; + if (dev_index >= GGML_HEXAGON_MAX_DEVICES) { + GGMLHEXAGON_LOG_INFO("assume the default ggml backend"); + return nullptr; + } } else { - QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggmlqnn_get_tensor_data_size(src0)}; + GGMLHEXAGON_LOG_INFO("program specified param is not nullptr"); + //user's program calling ggml_backend_hexagon_device_init_backend directly + dev_index = (int)(intptr_t)params; + g_hexagon_appcfg.hexagon_backend = dev_index; + GGMLHEXAGON_LOG_INFO("program specified dev_index %d\n", dev_index); } - QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggmlqnn_get_tensor_data_size(src1)}; - QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggmlqnn_get_tensor_data_size(dst)}; + GGMLHEXAGON_LOG_DEBUG("hexagon_backend=%d", dev_index); + ggml_backend_t hexagon_backend = ggml_backend_hexagon_init(dev_index, g_hexagon_appcfg.runtimelib_path); + GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); - Qnn_Tensor_t tensor_inputs[] = { - *p_tensor0, - *p_tensor1 - }; - Qnn_Tensor_t tensor_outputs[] = { - *p_tensor2 - }; - CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle, - tensor_inputs, 2, - tensor_outputs, 1, - nullptr, nullptr)); - op_perf.info(); -} + return hexagon_backend; -static void ggmlqnn_compute_repeat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); } -static void ggmlqnn_compute_div(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} +static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device_index) { + if (device_index >= GGML_HEXAGON_MAX_DEVICES) { + GGMLHEXAGON_LOG_DEBUG("ggml_backend_hexagon_buffer_type error: device_index:%d is out of range [0, %d]\n", + device_index, GGML_HEXAGON_MAX_DEVICES - 1); + return nullptr; + } -static void ggmlqnn_compute_leaky_relu(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} + static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + /* .iface = */ { + /* .get_name = */ ggml_backend_hexagon_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_hexagon_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_hexagon_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_hexagon_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .is_host = */ ggml_backend_hexagon_buffer_is_host + }, + /* .device = */ nullptr, + /* .context = */ nullptr, + }; -static void ggmlqnn_compute_concat(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); + return 
&ggml_backend_buffer_type_qnn; } -static void ggmlqnn_compute_arange(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *) dev->context; + return ggml_backend_hexagon_buffer_type(ctx->device); } -static void ggmlqnn_compute_sqr(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} +static ggml_backend_buffer_t ggml_backend_hexagon_device_buffer_from_host_ptr(ggml_backend_dev_t dev, + void * ptr, size_t size, size_t max_tensor_size) { + return ggml_backend_cpu_buffer_from_ptr(ptr, size); -static void ggmlqnn_compute_clamp(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); + GGML_UNUSED(dev); + GGML_UNUSED(max_tensor_size); } -static void ggmlqnn_compute_scale(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *) dev->context; + return (ggmlbackend_can_handle_op(ctx,op)); } -static void ggmlqnn_compute_argsort(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { + GGML_UNUSED(dev); + return ggml_backend_buft_is_host(buft); } -static void ggmlqnn_compute_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} +static struct ggml_backend_device_i ggml_backend_hexagon_device_interface = { + /* .get_name = */ ggml_backend_hexagon_device_get_name, + /* .get_description = */ ggml_backend_hexagon_device_get_description, + /* .get_memory = */ ggml_backend_hexagon_device_get_memory, + /* .get_type = */ ggml_backend_hexagon_device_get_type, + /* .get_props = */ ggml_backend_hexagon_device_get_props, + /* .init_backend = */ ggml_backend_hexagon_device_init_backend, + /* .get_buffer_type = */ ggml_backend_hexagon_device_get_buffer_type, + /* .get_host_buffer_type = */ nullptr, + /* .buffer_from_host_ptr = */ ggml_backend_hexagon_device_buffer_from_host_ptr, + /* .supports_op = */ ggml_backend_hexagon_device_supports_op, + /* .supports_buft = */ ggml_backend_hexagon_device_supports_buft, + /* .offload_op = */ nullptr, + /* .event_new = */ nullptr, + /* .event_free = */ nullptr, + /* .event_synchronize = */ nullptr, +}; -static void ggmlqnn_compute_group_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} +static ggml_backend_i ggml_backend_hexagon_interface = { + /* .get_name = */ ggml_backend_hexagon_name, + /* .free = */ ggml_backend_hexagon_free, + /* .set_tensor_async = */ nullptr, + /* .get_tensor_async = */ nullptr, + /* .cpy_tensor_async = */ nullptr, + /* .synchronize = */ nullptr, + /* .graph_plan_create = */ nullptr, + /* .graph_plan_free = */ nullptr, + /* .graph_plan_update = */ nullptr, + /* .graph_plan_compute = */ nullptr, + /* .graph_compute = */ nullptr, + /* .event_record = */ nullptr, + /* .event_wait = */ nullptr, +}; -static void ggmlqnn_compute_acc(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +//FIXME: this guid is not make sense +static ggml_guid_t ggml_backend_hexagon_guid() { + 
static ggml_guid guid = { + 0x1a, 0x2b, 0x3c, 0x4d, 0x5e, 0x6f, 0x70, 0x81, + 0x92, 0xa3, 0xb4, 0xc5, 0xd6, 0xe7, 0xf8, 0x09 + }; + return &guid; } -static void ggmlqnn_compute_sum_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +bool ggml_backend_is_hexagon(ggml_backend_t backend) { + return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_hexagon_guid()); } -static void ggmlqnn_compute_upsample_nearest2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} +void ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_threads) { + GGML_ASSERT(ggml_backend_is_hexagon(backend)); -static void ggmlqnn_compute_pad(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); + struct ggml_backend_hexagon_context * ctx = (struct ggml_backend_hexagon_context *)backend->context; + ctx->n_threads = n_threads; } -static void ggmlqnn_compute_pool2d(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +int ggml_backend_hexagon_get_device_count() { + return GGML_HEXAGON_MAX_DEVICES; } -static void ggmlqnn_compute_dup(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); -} +struct ggml_backend_hexagon_reg_context { + std::vector devices; +}; -static void ggmlqnn_compute_rms_norm(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return "ggml-hexagon"; } -static void ggmlqnn_compute_diag_mask(ggml_backend_qnn_context * ctx, ggml_tensor * dst, float value) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); - GGML_UNUSED(value); +static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) { + GGML_UNUSED(reg); + return GGML_HEXAGON_MAX_DEVICES; } -static void ggmlqnn_compute_im2col(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) { + GGML_UNUSED(reg); + GGML_UNUSED(index); + + GGMLHEXAGON_LOG_DEBUG("index %d", index); + ggml_backend_hexagon_reg_context * ctx = (ggml_backend_hexagon_reg_context *)reg->context; + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; } -static void ggmlqnn_compute_timestep_embedding(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +static void * ggml_backend_hexagon_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { + GGML_UNUSED(reg); + + if (nullptr == name) + return nullptr; + + const char * slot_name = "ggml_backend_set_n_threads"; + if (0 == memcmp(name, slot_name, strlen(slot_name))) { + return (void *)ggml_backend_hexagon_set_n_threads; + } + return nullptr; } -static void ggmlqnn_compute_cpy(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - ggmlqnn_compute_dup(ctx, dst); +static const ggml_backend_reg_i ggml_backend_hexagon_reg_interface = { + /* .get_name = */ ggml_backend_hexagon_reg_get_name, + /* .get_device_count = */ ggml_backend_hexagon_reg_get_device_count, + /* .get_device = */ ggml_backend_hexagon_reg_get_device, + /* .get_proc_address = */ ggml_backend_hexagon_reg_get_proc_address, +}; + +ggml_backend_reg_t ggml_backend_hexagon_reg() { + static ggml_backend_reg reg; + static bool initialized = false; + GGMLHEXAGON_LOG_DEBUG("enter 
ggml_backend_hexagon_reg"); + + //case-2: normal scenario, such as llama-cli or UI applicaton + ggmlhexagon_load_cfg(); + GGMLHEXAGON_LOG_INFO("hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_INFO("user's specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); + GGMLHEXAGON_LOG_INFO("user's specified runtime lib path=%s", g_hexagon_appcfg.runtimelib_path); + if (g_hexagon_appcfg.hexagon_backend >= GGML_HEXAGON_MAX_DEVICES) { + GGMLHEXAGON_LOG_INFO("assume default ggml backend"); + GGMLHEXAGON_LOG_DEBUG("leave ggml_backend_hexagon_reg"); + return nullptr; + } + + { + static std::mutex mutex; + std::lock_guard lock(mutex); + if (!initialized) { + ggml_backend_hexagon_reg_context * ctx = new ggml_backend_hexagon_reg_context; + + for (int i = 0; i < ggml_backend_hexagon_get_device_count(); i++) { + ggml_backend_dev_t dev = new ggml_backend_device { + /* .iface = */ ggml_backend_hexagon_device_interface, + /* .reg = */ ®, + /* .context = */ &g_hexagon_mgr[i] + }; + ctx->devices.push_back(dev); + } + + reg = ggml_backend_reg { + /* .api_version = */ GGML_BACKEND_API_VERSION, + /* .iface = */ ggml_backend_hexagon_reg_interface, + /* .context = */ ctx + }; + } + + initialized = true; + } + GGMLHEXAGON_LOG_DEBUG("leave ggml_backend_hexagon_reg"); + + return ® } -static void ggmlqnn_compute_softmax(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +const char * ggml_backend_hexagon_get_devname(size_t dev_num) { + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if (dev_num == HEXAGON_BACKEND_GGML) + return "ggml"; + else + return "ggml-hexagon"; + } + + switch (dev_num) { + case HEXAGON_BACKEND_QNNCPU: + return "QNN-CPU"; + case HEXAGON_BACKEND_QNNGPU: + return "QNN-GPU"; + case HEXAGON_BACKEND_QNNNPU: + return "QNN-NPU"; + case HEXAGON_BACKEND_GGML: + return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + default: + return "unknown"; + } } -static void ggmlqnn_compute_get_rows(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_lib_path) { + int result = 0; + GGMLHEXAGON_LOG_INFO("hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + + qnn_instance * instance = nullptr; + instance = new qnn_instance(qnn_lib_path, g_hexagon_mgr[device].lib, ""); + result = instance->qnn_init(nullptr); + if (0 != result) { + GGMLHEXAGON_LOG_WARN("init qnn subsystem failed with qnn backend %s, pls check why\n", + ggml_backend_hexagon_get_devname(device)); + delete instance; + return nullptr; + } + qnn_interface qnn_interface = instance->get_qnn_interface(); + if (!qnn_interface.is_loaded()) { + GGMLHEXAGON_LOG_WARN("qnn subsystem failure\n"); + delete instance; + return nullptr; + } + + std::string device_name = ggml_backend_hexagon_get_devname(device); + GGMLHEXAGON_LOG_INFO("qnn device name %s", device_name.c_str()); + g_hexagon_mgr[device].instance = instance; + g_hexagon_mgr[device].raw_interface = instance->get_qnn_raw_interface(); + g_hexagon_mgr[device].raw_system_interface = instance->get_qnn_raw_system_interface(); + + return instance; } -static void ggmlqnn_compute_rope(ggml_backend_qnn_context * ctx, ggml_tensor * dst) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); +/** + * + * @param 
device 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU/HEXAGON_BACKEND_CDSP + * @param qnn_lib_path QNN binrary runtime library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @return + */ +ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_path) { + int result = 0; + + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); + //case-3: calling ggml_backend_hexagon_init() directly in user's code + ggmlhexagon_load_cfg(); + + if (nullptr == qnn_lib_path) + return nullptr; + + GGMLHEXAGON_LOG_DEBUG("device %d", device); + GGMLHEXAGON_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + if (device >= GGML_HEXAGON_MAX_DEVICES) { + GGMLHEXAGON_LOG_ERROR("invalid device %d", device); + return nullptr; + } + +#if defined(__ANDROID__) + std::string path = qnn_lib_path; + GGMLHEXAGON_LOG_DEBUG("lib_path %s", path.c_str()); + ggmlhexagon_set_runtime_path(device, path); +#endif + + if (nullptr != g_hexagon_mgr[device].backend) { + GGMLHEXAGON_LOG_DEBUG("backend %d(%s) already loaded", device, + ggml_backend_hexagon_get_devname(device)); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__); + return g_hexagon_mgr[device].backend; + } + + //don't initialize QNN when hwaccel approach is offload ggml op to Hexagon cDSP directly + if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) { + qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path); + if (nullptr == instance) + return nullptr; + } + + ggml_backend_hexagon_interface.graph_compute = ggmlhexagon_backend_graph_compute_general; + + ggml_backend_t hexagon_backend = new ggml_backend{ + /* .guid = */ ggml_backend_hexagon_guid(), + /* .iface = */ ggml_backend_hexagon_interface, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), device), + /* .context = */ &g_hexagon_mgr[device] + }; + + g_hexagon_mgr[device].backend = hexagon_backend; + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[device]); + if (0 != result) { + GGMLHEXAGON_LOG_INFO("init hexagon dsp failure"); + ggml_backend_hexagon_free(hexagon_backend); + return nullptr; + } + //ensure test-backend-ops get the correct backend name when inference approach is 1(HWACCEL_CDSP) + memcpy(g_hexagon_mgr[device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); + } else { + //got fully description of SoC when hwaccel approach is HWACCEL_QNN and backend is HEXAGON_BACKEND_QNNNPU + GGMLHEXAGON_LOG_INFO("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device)); + } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__); + + return hexagon_backend; } + +GGML_BACKEND_DL_IMPL(ggml_backend_hexagon_reg) diff --git a/ggml/src/ggml-qnn/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile similarity index 100% rename from ggml/src/ggml-qnn/kernels/Makefile rename to ggml/src/ggml-hexagon/kernels/Makefile diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c similarity index 100% rename from ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.c rename to ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h similarity index 100% rename from ggml/src/ggml-qnn/kernels/ggmlop_ap_skel.h rename to ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c similarity index 100% rename from 
ggml/src/ggml-qnn/kernels/ggmlop_cdsp.c rename to ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c diff --git a/ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c similarity index 100% rename from ggml/src/ggml-qnn/kernels/ggmlop_cdsp_skel.c rename to ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index ecce07af250cd..3f1069be4978a 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -1,5 +1,5 @@ #!/bin/bash -# build llama.cpp + ggml-qnn for Snapdragon mobile SoC equipped Android phone on Linux +# build llama.cpp + ggml-hexagon for Snapdragon mobile SoC equipped Android phone on Linux set -e @@ -32,6 +32,7 @@ function dump_vars() { echo -e "ANDROID_NDK: ${ANDROID_NDK}" echo -e "QNN_SDK_PATH: ${QNN_SDK_PATH}" + echo -e "HEXAGON_SDK_PATH: ${HEXAGON_SDK_PATH}" } @@ -46,6 +47,8 @@ function check_hexagon_sdk() if [ ! -d ${HEXAGON_SDK_PATH} ]; then echo -e "HEXAGON_SDK_PATH ${HEXAGON_SDK_PATH} not exist, pls install it accordingly...\n" exit 0 + else + printf "Qualcomm Hexagon SDK already exist:${HEXAGON_SDK_PATH} \n\n" fi } @@ -117,7 +120,7 @@ function check_and_download_ndk() function build_arm64 { - cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_QNN=ON -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} + cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_HEXAGON=ON -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} cd out/android make -j16 show_pwd @@ -166,7 +169,7 @@ function update_qnn_libs() function update_qnn_cfg() { - adb push ./scripts/ggml-qnn.cfg ${REMOTE_PATH}/ + adb push ./scripts/ggml-hexagon.cfg ${REMOTE_PATH}/ } @@ -197,7 +200,7 @@ function prepare_run_on_phone() fi adb push ./out/android/bin/${program} ${REMOTE_PATH}/ adb shell chmod +x ${REMOTE_PATH}/${program} - adb push ggml/src/ggml-qnn/kernels/libggmlop_skel${HTP_ARCH_VERSION}.so ${REMOTE_PATH}/libggmlop_skel.so + adb push ggml/src/ggml-hexagon/kernels/libggmlop_skel${HTP_ARCH_VERSION}.so ${REMOTE_PATH}/libggmlop_skel.so } function run_llamacli() @@ -236,31 +239,14 @@ function run_test-op() { prepare_run_on_phone test-backend-ops - qnnbackendname=qnn-cpu - case $qnnbackend in - 0) - qnnbackendname=qnn-cpu - ;; - 1) - qnnbackendname=qnn-gpu - ;; - 2) - qnnbackendname=qnn-npu - ;; - *) - qnnbackendname=qnn-cpu - ;; - esac - - #debug echo "adb shell cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname " + && ${REMOTE_PATH}/test-backend-ops test -o $opname " echo "\n" adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/test-backend-ops test -o $opname -b $qnnbackendname " + && ${REMOTE_PATH}/test-backend-ops test -o $opname " } @@ -353,7 +339,7 @@ function show_usage() echo " $0 build" echo " $0 updateqnnlib" echo " $0 run_testops" - echo " $0 run_testop [ADD/MUL_MAT] [0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU)]" + echo " $0 run_testop [ADD/MUL_MAT]" echo " $0 run_llamacli" echo " $0 
run_llamabench" @@ -399,17 +385,11 @@ elif [ $# == 1 ]; then show_usage exit 1 fi -elif [ $# == 3 ]; then +elif [ $# == 2 ]; then opname=$2 #TODO: check opname in oplist #opname can be found via print_oplist: - qnnbackend=$3 - if [ ${qnnbackend} -gt 3 ]; then - show_usage - exit 1 - fi - run_test-op exit 0 else diff --git a/scripts/ggml-qnn.cfg b/scripts/ggml-hexagon.cfg similarity index 86% rename from scripts/ggml-qnn.cfg rename to scripts/ggml-hexagon.cfg index 7689e1dced161..75f3ab0d9d933 100644 --- a/scripts/ggml-qnn.cfg +++ b/scripts/ggml-hexagon.cfg @@ -1,9 +1,9 @@ [general] -#0: QNN-CPU backend -#1: QNN-GPU backend -#2: QNN-NPU backend / Hexagon cDSP +#0: HEXAGON_BACKEND_QNNCPU +#1: HEXAGON_BACKEND_QNNGPU +#2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP #3: default ggml backend -qnn_backend = 2 +hexagon_backend = 2 # enable/disable QNN's internal log print_qnn_internal_log = 0 diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 9aa1783202d67..d70acb7719435 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -4683,11 +4683,7 @@ int main(int argc, char ** argv) { continue; } -#ifdef GGML_USE_QNN - ggml_backend_t backend = ggml_backend_dev_init(dev, reinterpret_cast(i)); -#else ggml_backend_t backend = ggml_backend_dev_init(dev, NULL); -#endif GGML_ASSERT(backend != NULL); ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(dev); From c53f736131375e8a6afc777cbd35707d69ede905 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 29 Mar 2025 19:40:51 +0800 Subject: [PATCH 143/200] ggml-hexagon: release ggml-hexagon v0.99 --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 131 +++++++++++++++---------- 1 file changed, 77 insertions(+), 54 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index b52bf5a0418ce..8ca5e0653e3d2 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -751,8 +751,10 @@ static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, co static void ggmlhexagon_print_tensors_info(const char * func_name, const ggml_backend_hexagon_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * dst) { //skip sanity check of params because of performance concern - if (0 == g_hexagon_appcfg.print_tensors_info) - return; + if (0 == g_hexagon_appcfg.dump_op_info) { + if (0 == g_hexagon_appcfg.print_tensors_info) + return; + } if (nullptr != func_name && nullptr != ctx) { GGMLHEXAGON_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name); @@ -862,13 +864,17 @@ static void ggmlhexagon_get_timestring(char * p_currenttime) { } static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * ctx) { - GGMLHEXAGON_LOG_INFO("hwaccel approach is %d(%s)", g_hexagon_appcfg.hwaccel_approach, - ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); char timestamp[GGMLHEXAGON_TMPBUF_LEN]; memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); + + GGMLHEXAGON_LOG_INFO("hwaccel approach is %d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, + ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); ggmlhexagon_get_timestring(timestamp); if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - GGMLHEXAGON_LOG_INFO("only offload GGML_OP_ADD : %s", g_hexagon_appcfg.enable_q_mulmat ? 
"NO" : "YES"); + GGMLHEXAGON_LOG_INFO("only offload GGML_OP_ADD: %s", g_hexagon_appcfg.enable_mulmat_cdsp ? "NO" : "YES"); + GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); } else { GGMLHEXAGON_LOG_INFO("only offload GGML_OP_ADD: NO"); } @@ -1437,7 +1443,7 @@ static void ggmlhexagon_load_cfg() { qnncfg_instance.load(cfg_filename); qnncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { std::ostringstream tmposs; - tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]" << std::endl; + tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str()); }); std::string precision_mode; @@ -1453,11 +1459,10 @@ static void ggmlhexagon_load_cfg() { qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_hexagon_appcfg.enable_mulmat_cdsp, 0); qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0); - GGMLHEXAGON_LOG_INFO("print_qnn_internal_log=%d", g_hexagon_appcfg.print_qnn_internal_log); GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); - GGMLHEXAGON_LOG_INFO("hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); - GGMLHEXAGON_LOG_INFO("npu inference precision mode=%s", precision_mode.c_str()); + GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, + ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); GGMLHEXAGON_LOG_INFO("qnn runtime lib path=%s", g_hexagon_appcfg.runtimelib_path); if (precision_mode.find("fp16") != std::string::npos) { g_hexagon_appcfg.precision_mode = 1; @@ -4853,7 +4858,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); snprintf(ggmlop_domain_uri, ggmlop_domain_uri_len, "%s%s", ggmlop_URI, uri); - GGMLHEXAGON_LOG_INFO("ggmlop domain uri:%s", ggmlop_domain_uri); + GGMLHEXAGON_LOG_DEBUG("ggmlop domain uri:%s", ggmlop_domain_uri); hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); if (AEE_SUCCESS == hexagon_error) { GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); @@ -4976,9 +4981,6 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ dsptensor_2.nb[2] = dst->nb[2]; dsptensor_2.nb[3] = dst->nb[3]; - //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_0); - //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_1); - //GGMLQNN_DUMP_DSPTENSOR(&dsptensor_2); hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); if (AEE_SUCCESS != hexagon_error) { GGMLHEXAGON_LOG_WARN("ggmlop %s computation fail on cdsp", ggml_op_name(op->op)); @@ -4991,32 +4993,30 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ // ================================================================================================= // section-8: implementation of ggml-hexagon backend according to specification in ggml backend subsystem // ================================================================================================= -//hwaccel through 
cDSP -static bool ggmlhexagon_can_handle_op(const ggml_backend_hexagon_context * ctx, const struct ggml_tensor * op_tensor) { - ggmlhexagon_dump_op_info(op_tensor); +static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const struct ggml_tensor * op_tensor) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context; + GGML_UNUSED(ctx); + if (op_tensor->op == GGML_OP_NONE) { + return true; + } + if (!ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) { return false; } - struct ggml_tensor * src0 = op_tensor->src[0]; - struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = op_tensor->src[0]->ne[0]; - uint32_t src0_rank = ggml_n_dims(src0); + const struct ggml_tensor * src0 = op_tensor->src[0]; + const struct ggml_tensor * src1 = op_tensor->src[1]; + int64_t ne00 = 0; + uint32_t src0_rank = 0; uint32_t src1_rank = 0; + if (nullptr != src0) { + src0_rank = ggml_n_dims(src0); + ne00 = src0->ne[0]; + } if (nullptr != src1) { src1_rank = ggml_n_dims(src1); } - //available in the early stage, should be removed in the product stage - bool support = false; - if (g_hexagon_appcfg.enable_mulmat_cdsp) - support = ((op_tensor->op == GGML_OP_ADD) || (op_tensor->op == GGML_OP_MUL_MAT)); - else - support = (op_tensor->op == GGML_OP_ADD); - if (!support) { - return false; - } - switch (op_tensor->op) { case GGML_OP_ADD: case GGML_OP_SUB: @@ -5024,7 +5024,13 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_hexagon_context * ctx, if (!ggml_are_same_shape(src0, src1)) { return false; } - break; + + //FIXME:remove this filter + if (ne00 < 32) + return false; + + //FIXME:remove this filter + return ggmlhexagon_same_types(ctx, op_tensor); } case GGML_OP_MUL_MAT: { @@ -5034,6 +5040,7 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_hexagon_context * ctx, if (src0_rank != 2) return false; + ggmlhexagon_dump_op_info(op_tensor); if (g_hexagon_appcfg.enable_q_mulmat) return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); @@ -5043,27 +5050,28 @@ static bool ggmlhexagon_can_handle_op(const ggml_backend_hexagon_context * ctx, default: break; } - return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + return false; } -static bool ggmlbackend_can_handle_op(const ggml_backend_hexagon_context * ctx, const struct ggml_tensor * op_tensor) { +static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const struct ggml_tensor * op_tensor) { + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context; if (op_tensor->op == GGML_OP_NONE) { return true; } - if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - return ggmlhexagon_can_handle_op(ctx, op_tensor); - } - if (!ggmlqnn_k_op_caps[ggmlhexagon_get_op_index(op_tensor)].supported) { return false; } struct ggml_tensor * src0 = op_tensor->src[0]; struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = op_tensor->src[0]->ne[0]; - uint32_t src0_rank = ggml_n_dims(src0); + int64_t ne00 = 0; + uint32_t src0_rank = 0; uint32_t src1_rank = 0; + if (nullptr != src0) { + src0_rank = ggml_n_dims(src0); + ne00 = src0->ne[0]; + } if (nullptr != src1) { src1_rank = ggml_n_dims(src1); } @@ -5542,6 +5550,11 @@ static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_ GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend in cfgfile = %d", g_hexagon_appcfg.hexagon_backend); 
GGMLHEXAGON_LOG_DEBUG("user's sepcified qnn runtime lib path in cfgfile = %s", g_hexagon_appcfg.runtimelib_path); + if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported, using default ggml backend"); + return nullptr; + } + if (nullptr == params) { GGMLHEXAGON_LOG_DEBUG("program specified param is nullptr"); dev_index = (g_hexagon_appcfg.hexagon_backend > 0) ? g_hexagon_appcfg.hexagon_backend : 0; @@ -5600,11 +5613,6 @@ static ggml_backend_buffer_t ggml_backend_hexagon_device_buffer_from_host_ptr(gg GGML_UNUSED(max_tensor_size); } -static bool ggml_backend_hexagon_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *) dev->context; - return (ggmlbackend_can_handle_op(ctx,op)); -} - static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { GGML_UNUSED(dev); return ggml_backend_buft_is_host(buft); @@ -5620,7 +5628,7 @@ static struct ggml_backend_device_i ggml_backend_hexagon_device_interface = { /* .get_buffer_type = */ ggml_backend_hexagon_device_get_buffer_type, /* .get_host_buffer_type = */ nullptr, /* .buffer_from_host_ptr = */ ggml_backend_hexagon_device_buffer_from_host_ptr, - /* .supports_op = */ ggml_backend_hexagon_device_supports_op, + /* .supports_op = */ nullptr, /* .supports_buft = */ ggml_backend_hexagon_device_supports_buft, /* .offload_op = */ nullptr, /* .event_new = */ nullptr, @@ -5719,16 +5727,21 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { //case-2: normal scenario, such as llama-cli or UI applicaton ggmlhexagon_load_cfg(); - GGMLHEXAGON_LOG_INFO("hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + GGMLHEXAGON_LOG_DEBUG("hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); - GGMLHEXAGON_LOG_INFO("user's specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); - GGMLHEXAGON_LOG_INFO("user's specified runtime lib path=%s", g_hexagon_appcfg.runtimelib_path); + GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); + GGMLHEXAGON_LOG_DEBUG("user's specified runtime lib path=%s", g_hexagon_appcfg.runtimelib_path); if (g_hexagon_appcfg.hexagon_backend >= GGML_HEXAGON_MAX_DEVICES) { - GGMLHEXAGON_LOG_INFO("assume default ggml backend"); + GGMLHEXAGON_LOG_INFO("using default ggml backend"); GGMLHEXAGON_LOG_DEBUG("leave ggml_backend_hexagon_reg"); return nullptr; } + if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported, using default ggml backend"); + return nullptr; + } + { static std::mutex mutex; std::lock_guard lock(mutex); @@ -5736,6 +5749,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { ggml_backend_hexagon_reg_context * ctx = new ggml_backend_hexagon_reg_context; for (int i = 0; i < ggml_backend_hexagon_get_device_count(); i++) { + if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_cdsp; + } else { + ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_qnn; + } ggml_backend_dev_t dev = new ggml_backend_device { /* .iface = */ ggml_backend_hexagon_device_interface, /* .reg = */ ®, @@ -5763,18 +5781,18 @@ const char * ggml_backend_hexagon_get_devname(size_t dev_num) { if (dev_num == 
HEXAGON_BACKEND_GGML) return "ggml"; else - return "ggml-hexagon"; + return "HEXAGON_BACKEND_CDSP"; } switch (dev_num) { case HEXAGON_BACKEND_QNNCPU: - return "QNN-CPU"; + return "HEXAGON_BACKEND_QNN_CPU"; case HEXAGON_BACKEND_QNNGPU: - return "QNN-GPU"; + return "HEXAGON_BACKEND_QNN_GPU"; case HEXAGON_BACKEND_QNNNPU: - return "QNN-NPU"; + return "HEXAGON_BACKEND_QNN_NPU"; case HEXAGON_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between QNN backend and original GGML + return "ggml"; //"fake" QNN backend, used for compare performance between hexagon backend and the default ggml backend default: return "unknown"; } @@ -5826,6 +5844,11 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_pat if (nullptr == qnn_lib_path) return nullptr; + if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported, using default ggml backend"); + return nullptr; + } + GGMLHEXAGON_LOG_DEBUG("device %d", device); GGMLHEXAGON_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); if (device >= GGML_HEXAGON_MAX_DEVICES) { From 7c13b05994e8ad81fa16eba41eb3e6a94213ec44 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 29 Mar 2025 21:35:27 +0800 Subject: [PATCH 144/200] ggml-hexagon: try to offload q6_k mulmat to cDSP --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 6 +- ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c | 1328 +++++++++++++++---- scripts/ggml-hexagon.cfg | 2 +- 3 files changed, 1047 insertions(+), 289 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 8ca5e0653e3d2..85e0e4e1e3388 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -5028,7 +5028,7 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const //FIXME:remove this filter if (ne00 < 32) return false; - + //FIXME:remove this filter return ggmlhexagon_same_types(ctx, op_tensor); } @@ -5042,8 +5042,8 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const ggmlhexagon_dump_op_info(op_tensor); if (g_hexagon_appcfg.enable_q_mulmat) - return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) - && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q6_K + ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); else return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c index 410422f2f6d1e..c0f04c4935c4d 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c @@ -70,7 +70,17 @@ #define static_assert(a, b) do { } while (0) -typedef double ggml_float; +#define GROUP_MAX_EPS 1e-15f + +// QK = number of values after dequantization +// QK_K = super-block size +#define QK_K 256 +#define K_SCALE_SIZE 12 + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) #if 0//def NDEBUG #define GGMLQNN_DEBUG 0 @@ -175,16 +185,158 @@ enum ggml_type { GGML_TYPE_COUNT = 39, }; -static size_t ggml_nbytes(const struct ggml_tensor * tensor); -static void ggmlhexagon_log_internal(int level, const 
char * file, const char * func, int line, const char * format, ...); -static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); - -typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, - const void * GGML_RESTRICT y, size_t by, int nrc); -typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +typedef double ggml_float; +typedef uint16_t ggml_fp16_t; +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; +typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, + const void * GGML_RESTRICT y, size_t by, int nrc); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +struct ggml_compute_params { + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t wsize; + void * wdata; +}; -typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); +#define QK4_0 32 +typedef struct { + ggml_half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; + +#define QK4_1 32 +typedef struct { + union { + struct { + ggml_half d; // delta + ggml_half m; // min + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; + +#define QK5_0 32 +typedef struct { + ggml_half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; + +#define QK5_1 32 +typedef struct { + union { + struct { + ggml_half d; // delta + ggml_half m; // min + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; + +#define QK8_0 32 +typedef struct { + ggml_half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + +#define QK8_1 32 +typedef struct { + union { + struct { + ggml_half d; // delta + ggml_half s; // d * sum(qs[i]) + } GGML_COMMON_AGGR_S; + ggml_half2 ds; + } GGML_COMMON_AGGR_U; + int8_t qs[QK8_1]; // quants +} block_q8_1; + +// 2-bit quantization +// weight is represented as x = a * q + b +// 16 blocks of 16 elements each +// Effectively 2.625 bits per weight +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; +} block_q2_K; + +// 3-bit quantization +// weight is represented as x = a * q +// 16 blocks of 16 elements each +// Effectively 3.4375 bits per weight +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t scales[12]; // scales, quantized with 6 bits + ggml_half d; // super-block scale +} block_q3_K; + +// 4-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 4.5 bits per weight 
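// Worked size check (illustrative note, not part of this patch): one block_q4_K
// covers QK_K = 256 weights and occupies 2*sizeof(ggml_half) + K_SCALE_SIZE +
// QK_K/2 = 4 + 12 + 128 = 144 bytes, i.e. 144*8/256 = 4.5 bits per weight,
// which is where the figure above comes from; the other "effectively N bits"
// comments in this file follow from the same per-block arithmetic.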
+typedef struct { + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; + +// 5-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 5.5 bits per weight +typedef struct { + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; + +// 6-bit quantization +// weight is represented as x = a * q +// 16 blocks of 16 elements each +// Effectively 6.5625 bits per weight +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + ggml_half d; // super-block scale +} block_q6_K; + +typedef struct { + float d; // delta + int8_t qs[QK_K]; // quants + int16_t bsums[QK_K/16]; // sum of quants in groups of 16 +} block_q8_K; struct ggml_type_traits { const char * type_name; @@ -203,21 +355,252 @@ struct ggml_type_traits_cpu { int64_t nrows; // number of rows to process simultaneously }; -static const struct ggml_type_traits_cpu type_traits_cpu[1] = { +static size_t ggml_nbytes(const struct ggml_tensor * tensor); +static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...); +static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); + +static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); +static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); + +static float ggml_table_f32_f16[1 << 16]; + +static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = { .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, .nrows = 1, }, + [GGML_TYPE_F16] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_F16, + .nrows = 1, + }, + [GGML_TYPE_Q4_0] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif + }, + [GGML_TYPE_Q4_1] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_1, +#if defined (__ARM_FEATURE_MATMUL_INT8) + .nrows = 2, +#else + .nrows = 1, +#endif + }, + [GGML_TYPE_Q5_0] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, + .nrows = 1, + }, + [GGML_TYPE_Q5_1] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, + }, + [GGML_TYPE_Q8_0] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_0, +#if defined (__ARM_FEATURE_MATMUL_INT8) + 
.nrows = 2, +#else + .nrows = 1, +#endif + }, + [GGML_TYPE_Q8_1] = { + .from_float = NULL, + .vec_dot_type = GGML_TYPE_Q8_1, + .nrows = 1, + }, + [GGML_TYPE_Q2_K] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q3_K] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q4_K] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q5_K] = { + .from_float = NULL, + .vec_dot = NULL, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, + [GGML_TYPE_Q6_K] = { + .from_float = quantize_row_q6_K, + .vec_dot = ggml_vec_dot_q6_K_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + .nrows = 1, + }, }; -static const struct ggml_type_traits type_traits[1] = { +static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { + [GGML_TYPE_I8] = { + .type_name = "i8", + .blck_size = 1, + .type_size = sizeof(int8_t), + .is_quantized = false, + }, + [GGML_TYPE_I16] = { + .type_name = "i16", + .blck_size = 1, + .type_size = sizeof(int16_t), + .is_quantized = false, + }, + [GGML_TYPE_I32] = { + .type_name = "i32", + .blck_size = 1, + .type_size = sizeof(int32_t), + .is_quantized = false, + }, + [GGML_TYPE_I64] = { + .type_name = "i64", + .blck_size = 1, + .type_size = sizeof(int64_t), + .is_quantized = false, + }, + [GGML_TYPE_F64] = { + .type_name = "f64", + .blck_size = 1, + .type_size = sizeof(double), + .is_quantized = false, + }, [GGML_TYPE_F32] = { .type_name = "f32", .blck_size = 1, .type_size = sizeof(float), .is_quantized = false, }, + [GGML_TYPE_F16] = { + .type_name = "f16", + .blck_size = 1, + .type_size = sizeof(ggml_fp16_t), + .is_quantized = false, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q4_0] = { + .type_name = "q4_0", + .blck_size = QK4_0, + .type_size = sizeof(block_q4_0), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q4_1] = { + .type_name = "q4_1", + .blck_size = QK4_1, + .type_size = sizeof(block_q4_1), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [4] = { // GGML_TYPE_Q4_2 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + }, + [5] = { // GGML_TYPE_Q4_3 + .type_name = "DEPRECATED", + .blck_size = 0, + .type_size = 0, + .is_quantized = false, + }, + [GGML_TYPE_Q5_0] = { + .type_name = "q5_0", + .blck_size = QK5_0, + .type_size = sizeof(block_q5_0), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q5_1] = { + .type_name = "q5_1", + .blck_size = QK5_1, + .type_size = sizeof(block_q5_1), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q8_0] = { + .type_name = "q8_0", + .blck_size = QK8_0, + .type_size = sizeof(block_q8_0), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q8_1] = { + .type_name = "q8_1", + .blck_size = QK8_1, + .type_size = sizeof(block_q8_1), + .is_quantized = true, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q2_K] = { + .type_name = "q2_K", + .blck_size = QK_K, + .type_size = sizeof(block_q2_K), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q3_K] = { + .type_name = "q3_K", + .blck_size = QK_K, + .type_size = sizeof(block_q3_K), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q4_K] = { + .type_name = "q4_K", + .blck_size = QK_K, + .type_size = sizeof(block_q4_K), + 
.is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q5_K] = { + .type_name = "q5_K", + .blck_size = QK_K, + .type_size = sizeof(block_q5_K), + .is_quantized = true, + .to_float = NULL, + .from_float_ref = NULL, + }, + [GGML_TYPE_Q6_K] = { + .type_name = "q6_K", + .blck_size = QK_K, + .type_size = sizeof(block_q6_K), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_q6_K, + .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref, + }, }; @@ -419,6 +802,12 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { return true; } +static int64_t ggml_nelements(const struct ggml_tensor * tensor) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; +} + static bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 0); } @@ -437,6 +826,329 @@ static void ggml_abort(const char * file, int line, const char * fmt, ...) { return; } +// FP16 <-> FP32 +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} + +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} + +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float exp_scale = 0x1.0p-112f; +#else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); +} + +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +static void ggml_init() { + for (int i = 0; i < (1 << 16); ++i) { + union { + uint16_t u16; + ggml_fp16_t fp16; + } u = {i}; + ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); + } +} + +static inline int nearest_int(float fval) { + assert(fabsf(fval) <= 4194303.f); + float val = fval + 12582912.f; + int i; memcpy(&i, &val, sizeof(int)); + return (i & 0x007fffff) - 0x00400000; +} + +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type, + const float * GGML_RESTRICT qw) { + float max = 0; + float amax = 0; + for (int i = 0; i < n; ++i) { + float ax = fabsf(x[i]); + if (ax > amax) { amax = ax; max = x[i]; } + } + if (amax < GROUP_MAX_EPS) { // all zero + for (int i = 0; i < n; ++i) { + L[i] = 0; + } + return 0.f; + } + float iscale = -nmax / max; + if (rmse_type == 0) { + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } + return 1/iscale; + } + bool return_early = false; + if (rmse_type < 0) { + rmse_type = -rmse_type; + return_early = true; + } + float sumlx = 0; + float suml2 = 0; +#ifdef HAVE_BUGGY_APPLE_LINKER + // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 + for (volatile int i = 0; i < n; ++i) { +#else + for (int i = 0; i < n; ++i) { +#endif + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + L[i] = l + nmax; + float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + float scale = suml2 ? sumlx/suml2 : 0.0f; + if (return_early) return suml2 > 0 ? 0.5f*(scale + 1/iscale) : 1/iscale; + float best = scale * sumlx; + for (int is = -9; is <= 9; ++is) { + if (is == 0) { + continue; + } + iscale = -(nmax + 0.1f*is) / max; + sumlx = suml2 = 0; + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + l = MAX(-nmax, MIN(nmax-1, l)); + float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? 
fabsf(x[i]) : sqrtf(fabsf(x[i])); + sumlx += w*x[i]*l; + suml2 += w*l*l; + } + if (suml2 > 0 && sumlx*sumlx > best*suml2) { + for (int i = 0; i < n; ++i) { + int l = nearest_int(iscale * x[i]); + L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); + } + scale = sumlx/suml2; best = scale*sumlx; + } + } + return scale; +} + +static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + for (int i = 0; i < nb; i++) { + const float d = GGML_FP16_TO_FP32(x[i].d); + + const uint8_t * GGML_RESTRICT ql = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT sc = x[i].scales; + + for (int n = 0; n < QK_K; n += 128) { + for (int l = 0; l < 32; ++l) { + int is = l/16; + const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + y[l + 0] = d * sc[is + 0] * q1; + y[l + 32] = d * sc[is + 2] * q2; + y[l + 64] = d * sc[is + 4] * q3; + y[l + 96] = d * sc[is + 6] * q4; + } + y += 128; + ql += 64; + qh += 32; + sc += 8; + } + } +} + +static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) { + assert(k % QK_K == 0); + const int64_t nb = k / QK_K; + + int8_t L[QK_K]; + float scales[QK_K/16]; + + for (int i = 0; i < nb; i++) { + + float max_scale = 0; + float max_abs_scale = 0; + + for (int ib = 0; ib < QK_K/16; ++ib) { + + const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL); + scales[ib] = scale; + + const float abs_scale = fabsf(scale); + if (abs_scale > max_abs_scale) { + max_abs_scale = abs_scale; + max_scale = scale; + } + + } + + if (max_abs_scale < GROUP_MAX_EPS) { + memset(&y[i], 0, sizeof(block_q6_K)); + y[i].d = GGML_FP32_TO_FP16(0.f); + x += QK_K; + continue; + } + + float iscale = -128.f/max_scale; + y[i].d = GGML_FP32_TO_FP16(1/iscale); + for (int ib = 0; ib < QK_K/16; ++ib) { + y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); + } + + for (int j = 0; j < QK_K/16; ++j) { + float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; + if (!d) { + continue; + } + for (int ii = 0; ii < 16; ++ii) { + int l = nearest_int(x[16*j + ii]/d); + l = MAX(-32, MIN(31, l)); + L[16*j + ii] = l + 32; + } + } + + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + const uint8_t q1 = L[j + l + 0] & 0xF; + const uint8_t q2 = L[j + l + 32] & 0xF; + const uint8_t q3 = L[j + l + 64] & 0xF; + const uint8_t q4 = L[j + l + 96] & 0xF; + ql[l+ 0] = q1 | (q3 << 4); + ql[l+32] = q2 | (q4 << 4); + qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); + } + ql += 64; + qh += 32; + } + + x += QK_K; + } +} + +static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { + assert(k % QK_K == 0); + block_q6_K * GGML_RESTRICT y = vy; + quantize_row_q6_K_ref(x, y, k); +} + +static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { + assert(n % QK_K == 0); + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + + const block_q6_K * 
GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; + + const int nb = n / QK_K; + + int8_t aux8[QK_K]; + int16_t aux16[8]; + float sums [8]; + int32_t aux32[8]; + memset(sums, 0, 8*sizeof(float)); + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + memset(aux32, 0, 8*sizeof(int32_t)); + int8_t * GGML_RESTRICT a = aux8; + for (int j = 0; j < QK_K; j += 128) { + for (int l = 0; l < 32; ++l) { + a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; + a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; + a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; + a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; + } + a += 128; + q4 += 64; + qh += 32; + } + a = aux8; + int is = 0; + for (int j = 0; j < QK_K/16; ++j) { + int scale = x[i].scales[is++]; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; + for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; + q8 += 8; a += 8; + } + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; + } + for (int l = 0; l < 8; ++l) sumf += sums[l]; + *s = sumf; + +} + // ================================================================================================= // section-4: ggml-hexagon kernel helper function // ================================================================================================= @@ -446,6 +1158,9 @@ int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { tptr = (void *)malloc(1); *handle = (remote_handle64)tptr; assert(*handle); + + ggml_init(); + return 0; } @@ -574,19 +1289,19 @@ static void ggml_compute_forward_add_f32( if (nb10 == sizeof(float)) { for (int ir = ir0; ir < ir1; ++ir) { // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; + const int32_t nr0 = ne00 / ne10; float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - for (int64_t r = 0; r < nr0; ++r) { + for (int32_t r = 0; r < nr0; ++r) { ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); } } @@ -594,19 +1309,19 @@ static void ggml_compute_forward_add_f32( // src1 is not contiguous for (int ir = ir0; ir < ir1; ++ir) { // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - const int64_t i13 = i03 % ne13; - 
const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - for (int64_t i0 = 0; i0 < ne0; ++i0) { - const int64_t i10 = i0 % ne10; + for (int32_t i0 = 0; i0 < ne0; ++i0) { + const int32_t i10 = i0 % ne10; float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; @@ -638,276 +1353,48 @@ int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tenso return 0; } -static void ggml_compute_forward_mul_mat_one_chunk( +static void ggml_compute_forward_sub_f32( const ggml_tensor * src0, const ggml_tensor * src1, - struct ggml_tensor * dst, - const enum ggml_type type, - const int32_t num_rows_per_vec_dot, - const int32_t ir0_start, - const int32_t ir0_end, - const int32_t ir1_start, - const int32_t ir1_end) { - ggmlhexagon_dump_tensor(src0, 0); - ggmlhexagon_dump_tensor(src1, 0); - ggmlhexagon_dump_tensor(dst, 0); - - dst->ne[0] = src0->ne[1]; - dst->ne[1] = src1->ne[1]; - dst->ne[2] = src1->ne[2]; - dst->ne[3] = src1->ne[3]; + struct ggml_tensor * dst) { - dst->nb[0] = ggml_type_size(src1->type); - dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); - dst->nb[2] = dst->nb[1] * dst->ne[1]; - dst->nb[3] = dst->nb[2] * dst->ne[2]; - ggmlhexagon_dump_tensor(dst, 0); + memcpy(dst->ne, src1->ne, 16); + memcpy(dst->nb, src1->nb, 16); - GGML_TENSOR_BINARY_OP_LOCALS + assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - const bool src1_cont = ggml_is_contiguous(src1); + const int ith = 0; + const int nth = 1; - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + const int nr = ggml_nrows(src0); - // broadcast factors - const int32_t r2 = ne12 / ne02; - const int32_t r3 = ne13 / ne03; + GGML_TENSOR_BINARY_OP_LOCALS - if (ir0_start >= ir0_end || ir1_start >= ir1_end) { - return; - } + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); - //FIXME:hardcode to src1->data - const void * wdata = src1->data; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); + // rows per thread + const int dr = (nr + nth - 1)/nth; - assert(ne12 % ne02 == 0); - assert(ne13 % ne03 == 0); + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); - // block-tiling attempt - const int32_t blck_0 = 16; - const int32_t blck_1 = 16; + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int64_t i03 = ir/(ne02*ne01); + const int64_t i02 = (ir - i03*ne02*ne01)/ne01; + const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? 
row_size : nb11; + const int64_t i13 = i03 % ne13; + const int64_t i12 = i02 % ne12; + const int64_t i11 = i01 % ne11; + const int64_t nr0 = ne00 / ne10; - // attempt to reduce false-sharing (does not seem to make a difference) - // 16 * 2, accounting for mmla kernels - float tmp[32]; - - for (int32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { - for (int32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { - for (int32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { - const int32_t i13 = (ir1 / (ne12 * ne1)); - const int32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; - const int32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); - - // broadcast src0 into src1 - const int32_t i03 = i13 / r3; - const int32_t i02 = i12 / r2; - - const int32_t i1 = i11; - const int32_t i2 = i12; - const int32_t i3 = i13; - - const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); - - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char*)wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size - : (i11 * nb11 + i12 * nb12 + i13 * nb13)); - float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); - - //for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { - // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); - //} - - for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot); - } - - for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { - memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); - } - } - } - } -} - - int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - ggmlhexagon_dump_tensor(src0, 0); - ggmlhexagon_dump_tensor(src1, 0); - ggmlhexagon_dump_tensor(dst, 0); - - dst->ne[0] = src0->ne[1]; - dst->ne[1] = src1->ne[1]; - dst->ne[2] = src1->ne[2]; - dst->ne[3] = src1->ne[3]; - - dst->nb[0] = ggml_type_size(src1->type); - dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); - dst->nb[2] = dst->nb[1] * dst->ne[1]; - dst->nb[3] = dst->nb[2] * dst->ne[2]; - ggmlhexagon_dump_tensor(dst, 0); - - GGML_TENSOR_BINARY_OP_LOCALS - - enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; - ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - int32_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(src0->type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - -#if 0 //naive algorithm for fp32, can pass various case in UT - { - //ggml_dump_tensor(src0); - //ggml_dump_tensor(src1); - - float * a = (float*)src0->data; - float * b = (float*)src1->data; - float * c = (float*)dst->data; - int M = src0->ne[1]; - int K = src0->ne[0]; - int N = src1->ne[1]; - float sum = 0; - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { - sum = 0; - for (int h = 0; h < K; h++) { - sum += a[i * K + h] * b[h * N + j]; - } - c[i * N + j] = sum; - } - } - return 0; - } -#endif - - // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) - const int32_t nr0 = ne0; - - // This is the size of the rest of the dimensions of the result - const int32_t nr1 = ne1 * ne2 * ne3; - - // Now select a reasonable chunk size. - int chunk_size = 16; - - // We need to step up the size if it's small - if (nr0 == 1 || nr1 == 1) { - chunk_size = 64; - } - - // distribute the work across the inner or outer loop based on which one is larger - // The number of chunks in the 0/1 dim. - // CEIL(nr0/chunk_size) - int32_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; - int32_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; - - // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. - // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915 - // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that. 
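// Illustrative sketch (not part of this patch): the chunking used here splits the
// nr0 x nr1 output into ceil(nr0/chunk_size) * ceil(nr1/chunk_size) tiles via
// ceiling division; the helper name below is hypothetical.
static inline void sketch_chunk_bounds(int nr, int nchunk, int chunk_idx,
                                       int * start, int * end) {
    const int dr = (nr + nchunk - 1) / nchunk;        // elements handled per chunk (ceiling division)
    *start = dr * chunk_idx;                          // first output row/col of this chunk
    *end   = (*start + dr < nr) ? *start + dr : nr;   // clamp the final chunk to nr
}
// Example: nr0 = 4096, nr1 = 512 with chunk_size = 16 gives nchunk0 = 256 and
// nchunk1 = 32, so dr0 = dr1 = 16; chunk index 5 maps to ith0 = 5, ith1 = 0,
// i.e. output rows [80, 96) and columns [0, 16).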
- if (nchunk0 * nchunk1 < 4) { - // distribute the thread work across the inner or outer loop based on which one is larger - nchunk0 = 1; // parallelize by src0 rows - nchunk1 = 1; // parallelize by src1 rows - } - - // The number of elements in each chunk - const int32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; - const int32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - - // The first chunk comes from our thread_id, the rest will get auto-assigned. - int current_chunk = 0; - - while (current_chunk < nchunk0 * nchunk1) { - const int32_t ith0 = current_chunk % nchunk0; - const int32_t ith1 = current_chunk / nchunk0; - - const int32_t ir0_start = dr0 * ith0; - const int32_t ir0_end = MIN(ir0_start + dr0, nr0); - - const int32_t ir1_start = dr1 * ith1; - const int32_t ir1_end = MIN(ir1_start + dr1, nr1); - - // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols - int32_t num_rows_per_vec_dot = vec_dot_num_rows; - - // these checks are needed to avoid crossing dim1 boundaries - // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity - if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { - num_rows_per_vec_dot = 1; - } - ggml_compute_forward_mul_mat_one_chunk(src0, src1, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); - - if (1 >= nchunk0 * nchunk1) { - break; - } - current_chunk++; - } - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; -} - -static void ggml_compute_forward_sub_f32( - const ggml_tensor * src0, - const ggml_tensor * src1, - struct ggml_tensor * dst) { - - memcpy(dst->ne, src1->ne, 16); - memcpy(dst->nb, src1->nb, 16); - - assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = 0; - const int nth = 1; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); for (int64_t r = 0; r < nr0; ++r) { #ifdef GGML_USE_ACCELERATE @@ -1135,3 +1622,274 @@ int ggmlop_dsp_div(remote_handle64 h, const ggml_tensor * src0, const ggml_tenso } } } + +static void ggml_compute_forward_mul_mat_one_chunk( + const struct ggml_compute_params * params, + const ggml_tensor * src0, + const ggml_tensor * src1, + struct ggml_tensor * dst, + const enum ggml_type type, + const int32_t num_rows_per_vec_dot, + const int32_t ir0_start, + const 
int32_t ir0_end, + const int32_t ir1_start, + const int32_t ir1_end) { + ggmlhexagon_dump_tensor(src0, 0); + ggmlhexagon_dump_tensor(src1, 0); + ggmlhexagon_dump_tensor(dst, 0); + + dst->ne[0] = src0->ne[1]; + dst->ne[1] = src1->ne[1]; + dst->ne[2] = src1->ne[2]; + dst->ne[3] = src1->ne[3]; + + dst->nb[0] = ggml_type_size(src1->type); + dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[2] = dst->nb[1] * dst->ne[1]; + dst->nb[3] = dst->nb[2] * dst->ne[2]; + ggmlhexagon_dump_tensor(dst, 0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + + // broadcast factors + const int32_t r2 = ne12 / ne02; + const int32_t r3 = ne13 / ne03; + + if (ir0_start >= ir0_end || ir1_start >= ir1_end) { + return; + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int32_t blck_0 = 16; + const int32_t blck_1 = 16; + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + + // attempt to reduce false-sharing (does not seem to make a difference) + // 16 * 2, accounting for mmla kernels + float tmp[32]; + + for (int32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int32_t i13 = (ir1 / (ne12 * ne1)); + const int32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + // broadcast src0 into src1 + const int32_t i03 = i13 / r3; + const int32_t i02 = i12 / r2; + + const int32_t i1 = i11; + const int32_t i2 = i12; + const int32_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char*)wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + //for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot); + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + } + } + } + } +} + +int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + ggmlhexagon_dump_tensor(src0, 0); + ggmlhexagon_dump_tensor(src1, 0); + ggmlhexagon_dump_tensor(dst, 0); + + dst->ne[0] = src0->ne[1]; + dst->ne[1] = src1->ne[1]; + dst->ne[2] = src1->ne[2]; + dst->ne[3] = src1->ne[3]; + + dst->nb[0] = ggml_type_size(src1->type); + dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[2] = dst->nb[1] * dst->ne[1]; + dst->nb[3] = dst->nb[2] * dst->ne[2]; + ggmlhexagon_dump_tensor(dst, 0); + + GGML_TENSOR_BINARY_OP_LOCALS + + enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; + int32_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; + const int ith = 0; + const int nth = 1; + + struct ggml_compute_params params; + params.ith = 0; + params.nth = 1; + params.wsize = 0; + params.wdata = NULL; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + +#if 0 //naive algorithm for fp32, can pass various case in UT + { + //ggml_dump_tensor(src0); + //ggml_dump_tensor(src1); + + float * a = (float*)src0->data; + float * b = (float*)src1->data; + float * c = (float*)dst->data; + int M = src0->ne[1]; + int K = src0->ne[0]; + int N = src1->ne[1]; + float sum = 0; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + sum = 0; + for (int h = 0; h < K; h++) { + sum += a[i * K + h] * b[h * N + j]; + } + c[i * N + j] = sum; + } + } + return 0; + } +#endif + + if (src1->type != vec_dot_type) { + params.wsize = ggml_row_size(vec_dot_type, ggml_nelements(src1)); + params.wdata = (char*)malloc(params.wsize); + } + + if (src1->type != vec_dot_type) { + char * wdata = params.wdata; + + const size_t nbw0 = ggml_type_size(vec_dot_type); + const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + assert(params.wsize >= ne13*nbw3); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + size_t bs = ggml_blck_size(vec_dot_type); + int64_t ne10_block_start = (ith * ne10/bs) / nth; + int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth; + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0), + (ne10_block_end - ne10_block_start) * bs); + } + } + } + } + + + // This is the size of the first dimension of the result, so we can iterate that way. 
(see the ASSERT above, these are the same numbers) + const int32_t nr0 = ne0; + + // This is the size of the rest of the dimensions of the result + const int32_t nr1 = ne1 * ne2 * ne3; + + // Now select a reasonable chunk size. + int chunk_size = 16; + + // We need to step up the size if it's small + if (nr0 == 1 || nr1 == 1) { + chunk_size = 64; + } + + // distribute the work across the inner or outer loop based on which one is larger + // The number of chunks in the 0/1 dim. + // CEIL(nr0/chunk_size) + int32_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; + int32_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; + + // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. + // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915 + // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that. + if (nchunk0 * nchunk1 < 4) { + // distribute the thread work across the inner or outer loop based on which one is larger + nchunk0 = 1; // parallelize by src0 rows + nchunk1 = 1; // parallelize by src1 rows + } + + // The number of elements in each chunk + const int32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const int32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + + // The first chunk comes from our thread_id, the rest will get auto-assigned. + int current_chunk = 0; + + while (current_chunk < nchunk0 * nchunk1) { + const int32_t ith0 = current_chunk % nchunk0; + const int32_t ith1 = current_chunk / nchunk0; + + const int32_t ir0_start = dr0 * ith0; + const int32_t ir0_end = MIN(ir0_start + dr0, nr0); + + const int32_t ir1_start = dr1 * ith1; + const int32_t ir1_end = MIN(ir1_start + dr1, nr1); + + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int32_t num_rows_per_vec_dot = vec_dot_num_rows; + + // these checks are needed to avoid crossing dim1 boundaries + // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity + if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { + num_rows_per_vec_dot = 1; + } + ggml_compute_forward_mul_mat_one_chunk(¶ms, src0, src1, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + + if (1 >= nchunk0 * nchunk1) { + break; + } + current_chunk++; + } + if (src1->type != vec_dot_type) { + free(params.wdata); + } + + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} \ No newline at end of file diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 75f3ab0d9d933..ce8d57938f72d 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -33,5 +33,5 @@ precision_mode = "fp16" [cdsp] #enable/disable offload mulmat to cDSP enable_mulmat_cdsp = 1 -#enable/disable offload fp32 & all quantized type mulmat to cDSP +#enable/disable offload fp32 & quantized type mulmat to cDSP enable_q_mulmat = 0 From beab63f072ca7200451913f49778d6811e86d1b9 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 29 Mar 2025 23:43:48 +0800 Subject: [PATCH 145/200] ggml-hexagon: fix minior issue in ggml-hexagon.cpp after self code-review --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 85e0e4e1e3388..316745933c5ea 100644 --- 
a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -330,7 +330,7 @@ struct hexagon_appcfg_t { int hwaccel_approach; // 0: HWACCEL_QNN 1: HWACCEL_QNN_SINGLEGRAPH 2: HWACCEL_CDSP int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP int enable_mulmat_cdsp; // enable/disable offload mulmat to cDSP - int enable_q_mulmat; // enable/disable offload fp32 & all quantized type mulmat to cDSP + int enable_q_mulmat; // enable/disable offload fp32 & quantized mulmat to cDSP const char * cfgfilename; const char * runtimelib_path; }; @@ -1395,6 +1395,7 @@ static void * ggmlhexagon_type_trait(ggml_backend_hexagon_context * ctx, ggml_te } static void ggmlhexagon_set_runtime_path(size_t device, const std::string & path) { +#if defined(__ANDROID__) if ((HEXAGON_BACKEND_QNNNPU == device) || (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach)) { if (0 == setenv("LD_LIBRARY_PATH", (path + @@ -1424,6 +1425,7 @@ static void ggmlhexagon_set_runtime_path(size_t device, const std::string & path ggml_backend_hexagon_get_devname(device)); } } +#endif } static void ggmlhexagon_load_cfg() { @@ -4874,7 +4876,8 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { } return 0; - bail: + +bail: if (ggmlop_domain_uri) { free(ggmlop_domain_uri); } @@ -5344,7 +5347,7 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return "qnn-buffer"; + return "hexagon-buffer"; } static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( @@ -5399,7 +5402,7 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) { ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)backend->context; qnn_instance * instance = (qnn_instance*)g_hexagon_mgr[ctx->device].instance; - if (instance != nullptr) { + if (nullptr != instance) { std::map::iterator singlenode_graph_it; for (singlenode_graph_it = ctx->qnn_singlenode_graph_map.begin(); singlenode_graph_it != ctx->qnn_singlenode_graph_map.end(); singlenode_graph_it++) { @@ -5856,11 +5859,9 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_pat return nullptr; } -#if defined(__ANDROID__) std::string path = qnn_lib_path; GGMLHEXAGON_LOG_DEBUG("lib_path %s", path.c_str()); ggmlhexagon_set_runtime_path(device, path); -#endif if (nullptr != g_hexagon_mgr[device].backend) { GGMLHEXAGON_LOG_DEBUG("backend %d(%s) already loaded", device, @@ -5893,7 +5894,7 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_pat ggml_backend_hexagon_free(hexagon_backend); return nullptr; } - //ensure test-backend-ops get the correct backend name when inference approach is 1(HWACCEL_CDSP) + //ensure test-backend-ops get the correct backend name when hwaccel approach is 2(HWACCEL_CDSP) memcpy(g_hexagon_mgr[device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); } else { //got fully description of SoC when hwaccel approach is HWACCEL_QNN and backend is HEXAGON_BACKEND_QNNNPU From 6bd423105d0af827442a1e0f07e0c878112e30be Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 30 Mar 2025 16:38:31 +0800 Subject: [PATCH 146/200] ggml-hexagon: check validation of ggml-hexagon.cfg before create appropriate backend --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 31 +++++++++++++++++---- ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c | 2 +- 2 files changed, 26 insertions(+), 7 deletions(-) diff 
--git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 316745933c5ea..b2e5fed694b1e 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1128,6 +1128,7 @@ static const char * ggmlhexagon_get_socmodel_desc(uint32_t soc_model) { } } +//0x68 -> 68, 0x69 -> 69, 0x73 -> 73, 0x75 -> 75, 0x79 -> 79 static size_t ggmlhexagon_htparch_hex_to_decimal(size_t htp_arch) { //naive algorithm int a = htp_arch / 16; @@ -1474,6 +1475,27 @@ static void ggmlhexagon_load_cfg() { initialized = true; } +static bool ggmlhexagon_check_valid_appcfg() { + bool is_valid_appcfg = true; + + if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported"); + is_valid_appcfg = false; + } + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if (HEXAGON_BACKEND_CDSP != g_hexagon_appcfg.hexagon_backend) { + GGMLHEXAGON_LOG_INFO("hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP"); + is_valid_appcfg = false; + } + } + + if (!is_valid_appcfg) { + GGMLHEXAGON_LOG_INFO("it seems there is wrong configuration in ggml-hexagon.cfg, will using the default ggml backend accordingly"); + } + return is_valid_appcfg; +} + // ================================================================================================= // section-5: QNN helper function // ================================================================================================= @@ -5553,8 +5575,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_ GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend in cfgfile = %d", g_hexagon_appcfg.hexagon_backend); GGMLHEXAGON_LOG_DEBUG("user's sepcified qnn runtime lib path in cfgfile = %s", g_hexagon_appcfg.runtimelib_path); - if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { - GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported, using default ggml backend"); + if (!ggmlhexagon_check_valid_appcfg()) { return nullptr; } @@ -5740,8 +5761,7 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { return nullptr; } - if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { - GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported, using default ggml backend"); + if (!ggmlhexagon_check_valid_appcfg()) { return nullptr; } @@ -5847,8 +5867,7 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_pat if (nullptr == qnn_lib_path) return nullptr; - if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { - GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported, using default ggml backend"); + if (!ggmlhexagon_check_valid_appcfg()) { return nullptr; } diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c index c0f04c4935c4d..a526d44e2260c 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c @@ -1892,4 +1892,4 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); return 0; -} \ No newline at end of file +} From 6301f291c80771b4fc6864697010467c029e91ad Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 30 Mar 2025 22:11:56 +0800 Subject: [PATCH 147/200] ggml-hexagon: fix all compiler warnings in ggml-hexagon.cpp --- ggml/src/ggml-hexagon/CMakeLists.txt | 3 + ggml/src/ggml-hexagon/ggml-hexagon.cpp | 535 +++++++++--------- 
.../src/ggml-hexagon/kernels/ggmlop_ap_skel.c | 2 - .../src/ggml-hexagon/kernels/ggmlop_ap_skel.h | 9 +- ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c | 7 +- 5 files changed, 266 insertions(+), 290 deletions(-) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 7daedaa755c78..1814cdf4bb194 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -2,6 +2,9 @@ project(ggml-hexagon) message(STATUS "Using HEXAGON backend") message("CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}") +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + if(NOT DEFINED QNN_SDK_PATH) message(FATAL_ERROR "QNN_SDK_PATH not defined") endif() diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index b2e5fed694b1e..c460f0de66db8 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -8,7 +8,7 @@ * * this single-source-file or self-contained implementation of ggml-hexagon backend has 8 sections: * section-1 forward/prototype declaration, global vars, macros, data structures - * section-2 ggml-qnn internal troubleshooting function/class + * section-2 internal troubleshooting function/class * section-3 helper function for WoA(Windows on ARM) * section-4 general helper function * section-5 QNN helper function @@ -442,7 +442,7 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { // DSP - Choose a quantized model. Quantized models are required when running on the DSP backend // HTA - Choose a quantized model. Quantized models are required when running on the HTA backend static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICES] = { - [HEXAGON_BACKEND_QNNCPU] = {.device = 0, + { .device = 0, .name = "qnn-cpu", .desc = "Qualcomm Kryo CPU", #if !defined(__ANDROID__) && !defined(__linux__) @@ -456,7 +456,7 @@ static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICE .raw_system_interface = {}, .socinfo = {}}, - [HEXAGON_BACKEND_QNNGPU] = {.device = 1, + { .device = 1, .name = "qnn-gpu", .desc = "Qualcomm Adreno GPU", #if !defined(__ANDROID__) && !defined(__linux__) @@ -470,7 +470,7 @@ static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICE .raw_system_interface = {}, .socinfo = {}}, - [HEXAGON_BACKEND_QNNNPU] = {.device = 2, + { .device = 2, .name = "qnn-npu", .desc = "Qualcomm NPU(Hexagon Tensor Processor)", #if !defined(__ANDROID__) && !defined(__linux__) @@ -495,222 +495,222 @@ static domain hexagon_supported_domains[] = { //supported ggml op by HWACCEL_QNN static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { - {true, GGML_OP_NONE, 0}, - {false, GGML_OP_DUP}, + {true, GGML_OP_NONE, 0, nullptr}, + {false, GGML_OP_DUP, 0, nullptr}, {true, GGML_OP_ADD, 2, QNN_OP_ELEMENT_WISE_ADD}, - {false, GGML_OP_ADD1}, - {false, GGML_OP_ACC}, + {false, GGML_OP_ADD1, 0, nullptr}, + {false, GGML_OP_ACC, 0, nullptr}, {true, GGML_OP_SUB, 2, QNN_OP_ELEMENT_WISE_SUBTRACT}, {true, GGML_OP_MUL, 2, QNN_OP_ELEMENT_WISE_MULTIPLY}, {true, GGML_OP_DIV, 2, QNN_OP_ELEMENT_WISE_DIVIDE}, - {false, GGML_OP_SQR}, + {false, GGML_OP_SQR, 0, nullptr}, {true, GGML_OP_SQRT, 1, QNN_OP_ELEMENT_WISE_SQUARE_ROOT}, {true, GGML_OP_LOG, 1, QNN_OP_ELEMENT_WISE_LOG}, - {false, GGML_OP_SIN}, - {false, GGML_OP_COS}, - {false, GGML_OP_SUM}, - {false, GGML_OP_SUM_ROWS}, - {false, GGML_OP_MEAN}, - {false, GGML_OP_ARGMAX}, - {false, GGML_OP_COUNT_EQUAL}, - {false, GGML_OP_REPEAT}, - {false, GGML_OP_REPEAT_BACK}, - {false, 
GGML_OP_CONCAT}, - {false, GGML_OP_SILU_BACK}, - {false, GGML_OP_NORM}, - {false, GGML_OP_RMS_NORM}, - {false, GGML_OP_RMS_NORM_BACK}, - {false, GGML_OP_GROUP_NORM}, - {false, GGML_OP_L2_NORM}, + {false, GGML_OP_SIN, 0, nullptr}, + {false, GGML_OP_COS, 0, nullptr}, + {false, GGML_OP_SUM, 0, nullptr}, + {false, GGML_OP_SUM_ROWS, 0, nullptr}, + {false, GGML_OP_MEAN, 0, nullptr}, + {false, GGML_OP_ARGMAX, 0, nullptr}, + {false, GGML_OP_COUNT_EQUAL, 0, nullptr}, + {false, GGML_OP_REPEAT, 0, nullptr}, + {false, GGML_OP_REPEAT_BACK, 0, nullptr}, + {false, GGML_OP_CONCAT, 0, nullptr}, + {false, GGML_OP_SILU_BACK, 0, nullptr}, + {false, GGML_OP_NORM, 0, nullptr}, + {false, GGML_OP_RMS_NORM, 0, nullptr}, + {false, GGML_OP_RMS_NORM_BACK, 0, nullptr}, + {false, GGML_OP_GROUP_NORM, 0, nullptr}, + {false, GGML_OP_L2_NORM, 0, nullptr}, {true, GGML_OP_MUL_MAT, 2, QNN_OP_MAT_MUL}, - {false, GGML_OP_MUL_MAT_ID}, - {false, GGML_OP_OUT_PROD}, - {false, GGML_OP_SCALE}, - {false, GGML_OP_SET}, - {false, GGML_OP_CPY}, - {false, GGML_OP_CONT}, - {false, GGML_OP_RESHAPE}, - {false, GGML_OP_VIEW}, - {false, GGML_OP_PERMUTE}, - {false, GGML_OP_TRANSPOSE}, - {false, GGML_OP_GET_ROWS}, - {false, GGML_OP_GET_ROWS_BACK}, - {false, GGML_OP_DIAG}, - {false, GGML_OP_DIAG_MASK_INF}, - {false, GGML_OP_DIAG_MASK_ZERO}, - {false, GGML_OP_SOFT_MAX}, - {false, GGML_OP_SOFT_MAX_BACK}, - {false, GGML_OP_ROPE}, - {false, GGML_OP_ROPE_BACK}, - {false, GGML_OP_CLAMP}, - {false, GGML_OP_CONV_TRANSPOSE_1D}, - {false, GGML_OP_IM2COL}, - {false, GGML_OP_IM2COL_BACK}, - {false, GGML_OP_CONV_TRANSPOSE_2D}, - {false, GGML_OP_POOL_1D}, - {false, GGML_OP_POOL_2D}, - {false, GGML_OP_POOL_2D_BACK}, - {false, GGML_OP_UPSCALE}, - {false, GGML_OP_PAD}, - {false, GGML_OP_PAD_REFLECT_1D}, - {false, GGML_OP_ARANGE}, - {false, GGML_OP_TIMESTEP_EMBEDDING}, - {false, GGML_OP_ARGSORT}, - {false, GGML_OP_LEAKY_RELU}, - {false, GGML_OP_FLASH_ATTN_EXT}, - {false, GGML_OP_FLASH_ATTN_BACK}, - {false, GGML_OP_SSM_CONV}, - {false, GGML_OP_SSM_SCAN}, - {false, GGML_OP_WIN_PART}, - {false, GGML_OP_WIN_UNPART}, - {false, GGML_OP_GET_REL_POS}, - {false, GGML_OP_ADD_REL_POS}, - {false, GGML_OP_RWKV_WKV6}, - {false, GGML_OP_GATED_LINEAR_ATTN}, - {false, GGML_OP_RWKV_WKV7}, - {false, GGML_OP_UNARY}, - {false, GGML_OP_MAP_UNARY}, - {false, GGML_OP_MAP_BINARY}, - {false, GGML_OP_MAP_CUSTOM1_F32}, - {false, GGML_OP_MAP_CUSTOM2_F32}, - {false, GGML_OP_MAP_CUSTOM3_F32}, - {false, GGML_OP_MAP_CUSTOM1}, - {false, GGML_OP_MAP_CUSTOM2}, - {false, GGML_OP_MAP_CUSTOM3}, - {false, GGML_OP_CROSS_ENTROPY_LOSS}, - {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK}, - {false, GGML_OP_OPT_STEP_ADAMW}, - {false, static_cast(GGML_UNARY_OP_ABS)}, - {false, static_cast(GGML_UNARY_OP_SGN)}, - {false, static_cast(GGML_UNARY_OP_NEG)}, - {false, static_cast(GGML_UNARY_OP_STEP)}, - {false, static_cast(GGML_UNARY_OP_TANH)}, - {false, static_cast(GGML_UNARY_OP_ELU)}, - {false, static_cast(GGML_UNARY_OP_RELU)}, - {false, static_cast(GGML_UNARY_OP_SIGMOID)}, - {false, static_cast(GGML_UNARY_OP_GELU)}, - {false, static_cast(GGML_UNARY_OP_GELU_QUICK)}, - {false, static_cast(GGML_UNARY_OP_SILU)}, - {false, static_cast(GGML_UNARY_OP_HARDSWISH)}, - {false, static_cast(GGML_UNARY_OP_HARDSIGMOID)}, - {false, static_cast(GGML_UNARY_OP_EXP)} + {false, GGML_OP_MUL_MAT_ID, 0, nullptr}, + {false, GGML_OP_OUT_PROD, 0, nullptr}, + {false, GGML_OP_SCALE, 0, nullptr}, + {false, GGML_OP_SET, 0, nullptr}, + {false, GGML_OP_CPY, 0, nullptr}, + {false, GGML_OP_CONT, 0, nullptr}, + {false, GGML_OP_RESHAPE, 0, nullptr}, + 
{false, GGML_OP_VIEW, 0, nullptr}, + {false, GGML_OP_PERMUTE, 0, nullptr}, + {false, GGML_OP_TRANSPOSE, 0, nullptr}, + {false, GGML_OP_GET_ROWS, 0, nullptr}, + {false, GGML_OP_GET_ROWS_BACK, 0, nullptr}, + {false, GGML_OP_DIAG, 0, nullptr}, + {false, GGML_OP_DIAG_MASK_INF, 0, nullptr}, + {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr}, + {false, GGML_OP_SOFT_MAX, 0, nullptr}, + {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr}, + {false, GGML_OP_ROPE, 0, nullptr}, + {false, GGML_OP_ROPE_BACK, 0, nullptr}, + {false, GGML_OP_CLAMP, 0, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr}, + {false, GGML_OP_IM2COL, 0, nullptr}, + {false, GGML_OP_IM2COL_BACK, 0, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr}, + {false, GGML_OP_POOL_1D, 0, nullptr}, + {false, GGML_OP_POOL_2D, 0, nullptr}, + {false, GGML_OP_POOL_2D_BACK, 0, nullptr}, + {false, GGML_OP_UPSCALE, 0, nullptr}, + {false, GGML_OP_PAD, 0, nullptr}, + {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr}, + {false, GGML_OP_ARANGE, 0, nullptr}, + {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr}, + {false, GGML_OP_ARGSORT, 0, nullptr}, + {false, GGML_OP_LEAKY_RELU, 0, nullptr}, + {false, GGML_OP_FLASH_ATTN_EXT, 0, nullptr}, + {false, GGML_OP_FLASH_ATTN_BACK, 0, nullptr}, + {false, GGML_OP_SSM_CONV, 0, nullptr}, + {false, GGML_OP_SSM_SCAN, 0, nullptr}, + {false, GGML_OP_WIN_PART, 0, nullptr}, + {false, GGML_OP_WIN_UNPART, 0, nullptr}, + {false, GGML_OP_GET_REL_POS, 0, nullptr}, + {false, GGML_OP_ADD_REL_POS, 0, nullptr}, + {false, GGML_OP_RWKV_WKV6, 0, nullptr}, + {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr}, + {false, GGML_OP_RWKV_WKV7, 0, nullptr}, + {false, GGML_OP_UNARY, 0, nullptr}, + {false, GGML_OP_MAP_UNARY, 0, nullptr}, + {false, GGML_OP_MAP_BINARY, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM1_F32, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM2_F32, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM3_F32, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM1, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM2, 0, nullptr}, + {false, GGML_OP_MAP_CUSTOM3, 0, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr}, + {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_ABS), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SGN), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_NEG), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_STEP), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_TANH), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_ELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_RELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SIGMOID), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_SILU), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr}, + {false, static_cast(GGML_UNARY_OP_EXP), 0, nullptr} }; static_assert(ggmlqnn_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); static_assert(ggmlqnn_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); -static_assert(std::size(ggmlqnn_k_op_caps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), +static_assert(std::size(ggmlqnn_k_op_caps) == (static_cast(GGML_OP_COUNT) + static_cast(GGML_UNARY_OP_COUNT)), "pls check ggmlqnn_k_op_caps 
and ensure is corresponding to latest ggml.h"); //supported ggml op by HWACCEL_CDSP static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { - {true, GGML_OP_NONE, 0}, - {false, GGML_OP_DUP}, + {true, GGML_OP_NONE, 0, nullptr, nullptr}, + {false, GGML_OP_DUP, 0, nullptr, nullptr}, {true, GGML_OP_ADD, 2, "ggmlop_dsp_add", ggmlop_dsp_add}, - {false, GGML_OP_ADD1}, - {false, GGML_OP_ACC}, + {false, GGML_OP_ADD1, 0, nullptr, nullptr}, + {false, GGML_OP_ACC, 0, nullptr, nullptr}, {true, GGML_OP_SUB, 2, "ggmlop_dsp_sub", ggmlop_dsp_sub}, {true, GGML_OP_MUL, 2, "ggmlop_dsp_mul", ggmlop_dsp_mul}, {true, GGML_OP_DIV, 2, "ggmlop_dsp_div", ggmlop_dsp_div}, - {false, GGML_OP_SQR}, - {false, GGML_OP_SQRT, 1}, - {false, GGML_OP_LOG, 1}, - {false, GGML_OP_SIN}, - {false, GGML_OP_COS}, - {false, GGML_OP_SUM}, - {false, GGML_OP_SUM_ROWS}, - {false, GGML_OP_MEAN}, - {false, GGML_OP_ARGMAX}, - {false, GGML_OP_COUNT_EQUAL}, - {false, GGML_OP_REPEAT}, - {false, GGML_OP_REPEAT_BACK}, - {false, GGML_OP_CONCAT}, - {false, GGML_OP_SILU_BACK}, - {false, GGML_OP_NORM}, - {false, GGML_OP_RMS_NORM}, - {false, GGML_OP_RMS_NORM_BACK}, - {false, GGML_OP_GROUP_NORM}, - {false, GGML_OP_L2_NORM}, + {false, GGML_OP_SQR, 0, nullptr, nullptr}, + {false, GGML_OP_SQRT, 0, nullptr, nullptr}, + {false, GGML_OP_LOG, 0, nullptr, nullptr}, + {false, GGML_OP_SIN, 0, nullptr, nullptr}, + {false, GGML_OP_COS, 0, nullptr, nullptr}, + {false, GGML_OP_SUM, 0, nullptr, nullptr}, + {false, GGML_OP_SUM_ROWS, 0, nullptr, nullptr}, + {false, GGML_OP_MEAN, 0, nullptr, nullptr}, + {false, GGML_OP_ARGMAX, 0, nullptr, nullptr}, + {false, GGML_OP_COUNT_EQUAL, 0, nullptr, nullptr}, + {false, GGML_OP_REPEAT, 0, nullptr, nullptr}, + {false, GGML_OP_REPEAT_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONCAT, 0, nullptr, nullptr}, + {false, GGML_OP_SILU_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_NORM, 0, nullptr, nullptr}, + {false, GGML_OP_RMS_NORM, 0, nullptr, nullptr}, + {false, GGML_OP_RMS_NORM_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_GROUP_NORM, 0, nullptr, nullptr}, + {false, GGML_OP_L2_NORM, 0, nullptr, nullptr}, {true, GGML_OP_MUL_MAT, 2, "ggmlop_dsp_mulmat", ggmlop_dsp_mulmat}, - {false, GGML_OP_MUL_MAT_ID}, - {false, GGML_OP_OUT_PROD}, - {false, GGML_OP_SCALE}, - {false, GGML_OP_SET}, - {false, GGML_OP_CPY}, - {false, GGML_OP_CONT}, - {false, GGML_OP_RESHAPE}, - {false, GGML_OP_VIEW}, - {false, GGML_OP_PERMUTE}, - {false, GGML_OP_TRANSPOSE}, - {false, GGML_OP_GET_ROWS}, - {false, GGML_OP_GET_ROWS_BACK}, - {false, GGML_OP_DIAG}, - {false, GGML_OP_DIAG_MASK_INF}, - {false, GGML_OP_DIAG_MASK_ZERO}, - {false, GGML_OP_SOFT_MAX}, - {false, GGML_OP_SOFT_MAX_BACK}, - {false, GGML_OP_ROPE}, - {false, GGML_OP_ROPE_BACK}, - {false, GGML_OP_CLAMP}, - {false, GGML_OP_CONV_TRANSPOSE_1D}, - {false, GGML_OP_IM2COL}, - {false, GGML_OP_IM2COL_BACK}, - {false, GGML_OP_CONV_TRANSPOSE_2D}, - {false, GGML_OP_POOL_1D}, - {false, GGML_OP_POOL_2D}, - {false, GGML_OP_POOL_2D_BACK}, - {false, GGML_OP_UPSCALE}, - {false, GGML_OP_PAD}, - {false, GGML_OP_PAD_REFLECT_1D}, - {false, GGML_OP_ARANGE}, - {false, GGML_OP_TIMESTEP_EMBEDDING}, - {false, GGML_OP_ARGSORT}, - {false, GGML_OP_LEAKY_RELU}, - {false, GGML_OP_FLASH_ATTN_EXT}, - {false, GGML_OP_FLASH_ATTN_BACK}, - {false, GGML_OP_SSM_CONV}, - {false, GGML_OP_SSM_SCAN}, - {false, GGML_OP_WIN_PART}, - {false, GGML_OP_WIN_UNPART}, - {false, GGML_OP_GET_REL_POS}, - {false, GGML_OP_ADD_REL_POS}, - {false, GGML_OP_RWKV_WKV6}, - {false, GGML_OP_GATED_LINEAR_ATTN}, - {false, GGML_OP_RWKV_WKV7}, - {false, 
GGML_OP_UNARY}, - {false, GGML_OP_MAP_UNARY}, - {false, GGML_OP_MAP_BINARY}, - {false, GGML_OP_MAP_CUSTOM1_F32}, - {false, GGML_OP_MAP_CUSTOM2_F32}, - {false, GGML_OP_MAP_CUSTOM3_F32}, - {false, GGML_OP_MAP_CUSTOM1}, - {false, GGML_OP_MAP_CUSTOM2}, - {false, GGML_OP_MAP_CUSTOM3}, - {false, GGML_OP_CROSS_ENTROPY_LOSS}, - {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK}, - {false, GGML_OP_OPT_STEP_ADAMW}, - {false, static_cast(GGML_UNARY_OP_ABS)}, - {false, static_cast(GGML_UNARY_OP_SGN)}, - {false, static_cast(GGML_UNARY_OP_NEG)}, - {false, static_cast(GGML_UNARY_OP_STEP)}, - {false, static_cast(GGML_UNARY_OP_TANH)}, - {false, static_cast(GGML_UNARY_OP_ELU)}, - {false, static_cast(GGML_UNARY_OP_RELU)}, - {false, static_cast(GGML_UNARY_OP_SIGMOID)}, - {false, static_cast(GGML_UNARY_OP_GELU)}, - {false, static_cast(GGML_UNARY_OP_GELU_QUICK)}, - {false, static_cast(GGML_UNARY_OP_SILU)}, - {false, static_cast(GGML_UNARY_OP_HARDSWISH)}, - {false, static_cast(GGML_UNARY_OP_HARDSIGMOID)}, - {false, static_cast(GGML_UNARY_OP_EXP)} + {false, GGML_OP_MUL_MAT_ID, 0, nullptr, nullptr}, + {false, GGML_OP_OUT_PROD, 0, nullptr, nullptr}, + {false, GGML_OP_SCALE, 0, nullptr, nullptr}, + {false, GGML_OP_SET, 0, nullptr, nullptr}, + {false, GGML_OP_CPY, 0, nullptr, nullptr}, + {false, GGML_OP_CONT, 0, nullptr, nullptr}, + {false, GGML_OP_RESHAPE, 0, nullptr, nullptr}, + {false, GGML_OP_VIEW, 0, nullptr, nullptr}, + {false, GGML_OP_PERMUTE, 0, nullptr, nullptr}, + {false, GGML_OP_TRANSPOSE, 0, nullptr, nullptr}, + {false, GGML_OP_GET_ROWS, 0, nullptr, nullptr}, + {false, GGML_OP_GET_ROWS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG_MASK_INF, 0, nullptr, nullptr}, + {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr, nullptr}, + {false, GGML_OP_SOFT_MAX, 0, nullptr, nullptr}, + {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_ROPE, 0, nullptr, nullptr}, + {false, GGML_OP_ROPE_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CLAMP, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr, nullptr}, + {false, GGML_OP_IM2COL, 0, nullptr, nullptr}, + {false, GGML_OP_IM2COL_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr, nullptr}, + {false, GGML_OP_POOL_1D, 0, nullptr, nullptr}, + {false, GGML_OP_POOL_2D, 0, nullptr, nullptr}, + {false, GGML_OP_POOL_2D_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_UPSCALE, 0, nullptr, nullptr}, + {false, GGML_OP_PAD, 0, nullptr, nullptr}, + {false, GGML_OP_PAD_REFLECT_1D, 0, nullptr, nullptr}, + {false, GGML_OP_ARANGE, 0, nullptr, nullptr}, + {false, GGML_OP_TIMESTEP_EMBEDDING, 0, nullptr, nullptr}, + {false, GGML_OP_ARGSORT, 0, nullptr, nullptr}, + {false, GGML_OP_LEAKY_RELU, 0, nullptr, nullptr}, + {false, GGML_OP_FLASH_ATTN_EXT, 0, nullptr, nullptr}, + {false, GGML_OP_FLASH_ATTN_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_SSM_CONV, 0, nullptr, nullptr}, + {false, GGML_OP_SSM_SCAN, 0, nullptr, nullptr}, + {false, GGML_OP_WIN_PART, 0, nullptr, nullptr}, + {false, GGML_OP_WIN_UNPART, 0, nullptr, nullptr}, + {false, GGML_OP_GET_REL_POS, 0, nullptr, nullptr}, + {false, GGML_OP_ADD_REL_POS, 0, nullptr, nullptr}, + {false, GGML_OP_RWKV_WKV6, 0, nullptr, nullptr}, + {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr, nullptr}, + {false, GGML_OP_RWKV_WKV7, 0, nullptr, nullptr}, + {false, GGML_OP_UNARY, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_UNARY, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_BINARY, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM1_F32, 0, nullptr, nullptr}, + {false, 
GGML_OP_MAP_CUSTOM2_F32, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM3_F32, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM1, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM2, 0, nullptr, nullptr}, + {false, GGML_OP_MAP_CUSTOM3, 0, nullptr, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr, nullptr}, + {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_ABS), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SGN), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_NEG), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_STEP), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_TANH), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_ELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_RELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SIGMOID), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_GELU_QUICK), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_SILU), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSWISH), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_HARDSIGMOID), 0, nullptr, nullptr}, + {false, static_cast(GGML_UNARY_OP_EXP), 0, nullptr, nullptr} }; static_assert(ggmlhexagon_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); static_assert(ggmlhexagon_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); -static_assert(std::size(ggmlhexagon_k_op_caps) == (GGML_OP_COUNT + GGML_UNARY_OP_COUNT), +static_assert(std::size(ggmlhexagon_k_op_caps) == (static_cast(GGML_OP_COUNT) + static_cast(GGML_UNARY_OP_COUNT)), "pls check ggmlhexagon_k_op_caps and ensure is corresponding to latest ggml.h"); static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique @@ -735,7 +735,7 @@ static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, co int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args); if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) { #if (defined __ANDROID__) || (defined ANDROID) - __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlhexagon_log_internal_buf); + __android_log_print(ANDROID_LOG_INFO, "ggml-hexagon", "%s\n", s_ggmlhexagon_log_internal_buf); if (GGML_LOG_LEVEL_INFO == level) { printf("%s\n", s_ggmlhexagon_log_internal_buf); } @@ -883,7 +883,7 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c class hexagon_perf { public: - hexagon_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {}; + hexagon_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {} hexagon_perf() = delete; hexagon_perf(const hexagon_perf & ) = delete; hexagon_perf & operator= (const hexagon_perf & ) = delete; @@ -975,8 +975,8 @@ class hexagon_appcfg { private: void ltrim(std::string & str) { if (str.empty()) return; - size_t len = 0; - char* temp = (char*)str.c_str(); + size_t len = 0; + const char * temp = str.c_str(); while (*temp && isblank(*temp)) { ++len; ++temp; @@ -1527,6 +1527,9 @@ static int32_t ggmlqnn_get_idx(int idx_type) { default: break; } + + //it's not make sense, just for fix compiler warning + return g_qnntensor_idx; } 
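// ---------------------------------------------------------------------------
// Illustrative sketch (not part of this patch): the two caps tables above are
// laid out as [GGML_OP_COUNT regular-op entries][GGML_UNARY_OP_COUNT unary
// entries], which is exactly the invariant the static_asserts on the table
// sizes check. A lookup therefore maps a regular op directly to its enum
// value, while a GGML_OP_UNARY tensor is redirected past GGML_OP_COUNT using
// ggml_get_unary_op(). The helper name below is hypothetical and only shows
// the indexing convention; the real backend may use a differently named
// function for the same purpose.
static inline size_t ggmlhexagon_example_op_index(const ggml_tensor * op_tensor) {
    if (GGML_OP_UNARY == op_tensor->op) {
        // unary entries are appended after the regular ops
        return static_cast<size_t>(GGML_OP_COUNT)
             + static_cast<size_t>(ggml_get_unary_op(op_tensor));
    }
    return static_cast<size_t>(op_tensor->op);
}
// usage (illustrative):
//   const hexagon_op_caps & caps = ggmlhexagon_k_op_caps[ggmlhexagon_example_op_index(op_tensor)];
//   if (!caps.supported) return false;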
static intptr_t ggmlqnn_align_to(size_t alignment, intptr_t offset) { @@ -2731,7 +2734,6 @@ static void ggmlqnn_sdk_logcallback(const char * fmt, } int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { - BackendIdType backend_id = QNN_BACKEND_ID_NULL; GGMLHEXAGON_LOG_DEBUG("enter qni_init\n"); _device_id = HEXAGON_BACKEND_GGML; @@ -2815,12 +2817,12 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { size_t htp_arch = (size_t) chipinfo.arch; GGMLHEXAGON_LOG_INFO("htp_type:%d(%s)\n", devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "ON_CHIP" : ""); - soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize }; + soc_info = { chipinfo.socModel, htp_arch, chipinfo.vtcmSize, {} }; } _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info); } else { GGMLHEXAGON_LOG_WARN("failed to get platform info, are we in emulator?\n"); - soc_info = { NONE, UNKNOWN_SM, 0 }; + soc_info = { NONE, UNKNOWN_SM, 0, {} }; } QnnHtpDevice_CustomConfig_t soc_customconfig; @@ -3443,24 +3445,21 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn } Qnn_Tensor_t qnn_tensor = { - .version= QNN_TENSOR_VERSION_1, - {.v1= { + .version = QNN_TENSOR_VERSION_1, + .v1 = { .id = 0, .name = tensor_name, .type = qnn_tensor_type, .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, .dataType = qnn_data_type, .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, - .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, - {.scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0}}}, + .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED}, .rank = rank, .dimensions = tensor_dims, .memType = QNN_TENSORMEMTYPE_RAW, .clientBuf = {.data = nullptr, .dataSize = 0} } - } }; - Qnn_Tensor_t * p_qnn_tensor = (Qnn_Tensor_t *)calloc(1, sizeof(Qnn_Tensor_t)); if (nullptr == p_qnn_tensor) { GGMLHEXAGON_LOG_WARN("calloc failed"); @@ -3488,7 +3487,6 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn static Qnn_Tensor_t * ggmlqnn_create_compute_tensor(qnn_instance * instance, Qnn_GraphHandle_t graph_handle, const ggml_tensor * tensor, Qnn_TensorType_t tensor_type) { - Qnn_ErrorHandle_t error = QNN_SUCCESS; uint32_t dimensions[] = {(uint32_t) tensor->ne[0], (uint32_t) tensor->ne[1], (uint32_t) tensor->ne[2], (uint32_t) tensor->ne[3]}; Qnn_DataType_t qnn_data_type = QNN_DATATYPE_FLOAT_32; @@ -3670,7 +3668,6 @@ static void ggmlqnn_compute_elementwise(ggml_backend_hexagon_context * ctx, ggml */ static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_tensor * op) { Qnn_ErrorHandle_t error = QNN_SUCCESS; - bool graph_initialized = false; qnn_instance * instance = ctx->instance; QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface; @@ -3701,7 +3698,6 @@ static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_ Qnn_Tensor_t * p_reshape2_out = nullptr; if (ctx->qnn_singlenode_graph_map.find(graph_name) != ctx->qnn_singlenode_graph_map.end()) { - graph_initialized = true; qnn_singlenode_res_t & graph_item = ctx->qnn_singlenode_graph_map[graph_name]; graph_handle = std::get<0>(graph_item); qnn_ptensors_t & tensors = std::get<1>(graph_item); @@ -3759,7 +3755,7 @@ static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_ QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32, 1, tile_dims, tile_multiples, sizeof(tile_multiples)); - Qnn_Param_t tile_params[] = {{QNN_PARAMTYPE_TENSOR, "multiples", .tensorParam = *p_tile_multiples}}; + 
Qnn_Param_t tile_params[] = {{.paramType = QNN_PARAMTYPE_TENSOR, .name = "multiples", .tensorParam = *p_tile_multiples}}; Qnn_Tensor_t tile0_inputs[] = {*p_reshape0_out}; Qnn_Tensor_t tile0_outputs[] = {*p_tile0_out}; Qnn_OpConfig_t tile0_op = ggmlqnn_create_op_config("tile0", QNN_OP_PACKAGE_NAME_QTI_AISW, @@ -3790,7 +3786,7 @@ static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_ QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 4, permute1_out_dims, nullptr, 0); - Qnn_Param_t permute1_params[] = {{QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_perm}}; + Qnn_Param_t permute1_params[] = {{.paramType = QNN_PARAMTYPE_TENSOR, .name = "perm", .tensorParam = *p_perm}}; Qnn_Tensor_t permute1_inputs[] = {*p_tensor1}; Qnn_Tensor_t permute1_outputs[] = {*p_permute1_out}; Qnn_OpConfig_t permute1_op = ggmlqnn_create_op_config("permute1", QNN_OP_PACKAGE_NAME_QTI_AISW, @@ -3867,7 +3863,7 @@ static void ggmlqnn_compute_mul_mat_4d(ggml_backend_hexagon_context * ctx, ggml_ * `src1` and the weight tensor `src0`, handling transposing, and quantization as needed, * and stores the result in the destination tensor `dst`. * - there are two key-points in properly handling how to offload mulmat to the QNN backend in ggml-qnn + there are two key-points in properly handling how to offload mulmat to the QNN 1. transpose a 3x2 f32 matrix which means 3 rows and 2 columns. in ggml, it could be created from: struct ggml_tensor* matrix = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3); @@ -4002,8 +3998,8 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_ten //compose QNN graph: add mulmat node Qnn_Param_t out_0_params[] = { - {QNN_PARAMTYPE_SCALAR, QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = { - QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; + {.paramType = QNN_PARAMTYPE_SCALAR, .name = QNN_OP_MAT_MUL_PARAM_TRANSPOSE_IN1, .scalarParam = { + .dataType = QNN_DATATYPE_BOOL_8, .bool8Value = 1}}}; Qnn_Tensor_t out_0_inputs[] = {*p_tensor0, *p_tensor1}; Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose}; Qnn_OpConfig_t out_0 = ggmlqnn_create_op_config("mulmat_opconfig", @@ -4014,7 +4010,7 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_ten //compose QNN graph: add transpose node Qnn_Param_t out_trans1_0_params[] = { - {QNN_PARAMTYPE_TENSOR, "perm", .tensorParam = *p_param_tensor}}; + {.paramType = QNN_PARAMTYPE_TENSOR, .name = "perm", .tensorParam = *p_param_tensor}}; Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose}; Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2}; Qnn_OpConfig_t out_trans1_0 = ggmlqnn_create_op_config("mulmat_transpose_opconfig", @@ -4235,7 +4231,7 @@ static int ggmlhexagon_pd_status_notifier_callback(void * context, int domain, i static domain * ggmlhexagon_get_domain(int domain_id) { int size = sizeof(hexagon_supported_domains) / sizeof(domain); - for (size_t i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { if (hexagon_supported_domains[i].id == domain_id) return &hexagon_supported_domains[i]; } @@ -4250,11 +4246,11 @@ static bool ggmlhexagon_is_cdsp(int domain_id) { static bool ggmlhexagon_is_valid_domain_id(int domain_id, int compute_only) { int size = sizeof(hexagon_supported_domains) / sizeof(domain); - if (compute_only) { + if (0 != compute_only) { return ggmlhexagon_is_cdsp(domain_id); } - for (size_t i = 0; i < size; i++) { + for (int i = 0; i < size; i++) { if (hexagon_supported_domains[i].id == domain_id) return true; } @@ -4265,6 +4261,7 @@ static bool ggmlhexagon_is_valid_domain_id(int 
domain_id, int compute_only) { static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_domains, fastrpc_domain ** domains_info) { int hexagon_err = AEE_SUCCESS; int ss_info = 0; + void * buffer = nullptr; ss_info = strcmp(domain_type, "NSP")? HPASS: NSP; system_req_payload req; memset(&req, 0, sizeof(system_req_payload)); @@ -4283,44 +4280,39 @@ static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_doma goto bail; #endif - if (remote_system_request) { - hexagon_err = remote_system_request(&req); - if (hexagon_err != AEE_SUCCESS) { - GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d", hexagon_err); - goto bail; - } - //allocate memory for domain-info array - req.sys.max_domains = req.sys.num_domains; - void * buffer = calloc(req.sys.num_domains, sizeof(fastrpc_domain)); - if (nullptr == buffer) { - hexagon_err = AEE_ENOMEMORY; - GGMLHEXAGON_LOG_DEBUG("unable to allocate memory for req.sys.domains"); - goto bail; - } - req.sys.domains = static_cast(buffer); - hexagon_err = remote_system_request(&req); - if (hexagon_err != AEE_SUCCESS) { - GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d.\n", hexagon_err); - goto bail; - } + hexagon_err = remote_system_request(&req); + if (hexagon_err != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d", hexagon_err); + goto bail; + } + //allocate memory for domain-info array + req.sys.max_domains = req.sys.num_domains; + buffer = calloc(req.sys.num_domains, sizeof(fastrpc_domain)); + if (nullptr == buffer) { + hexagon_err = AEE_ENOMEMORY; + GGMLHEXAGON_LOG_DEBUG("unable to allocate memory for req.sys.domains"); + goto bail; + } + req.sys.domains = static_cast(buffer); + hexagon_err = remote_system_request(&req); + if (hexagon_err != AEE_SUCCESS) { + GGMLHEXAGON_LOG_DEBUG("failure in remote_system_request call: %d.\n", hexagon_err); + goto bail; + } - for (int i = 0; i < req.sys.num_domains; i++) { - //verify that only requested type domains were returned - domain = &req.sys.domains[i]; - if (domain->type != ss_info) { - hexagon_err = -1; - GGMLHEXAGON_LOG_DEBUG("incorrect data received from remote_system_request.\n"); - goto bail; - } + for (int i = 0; i < req.sys.num_domains; i++) { + //verify that only requested type domains were returned + domain = &req.sys.domains[i]; + if (domain->type != ss_info) { + hexagon_err = -1; + GGMLHEXAGON_LOG_DEBUG("incorrect data received from remote_system_request.\n"); + goto bail; } - *domains_info = req.sys.domains; - *num_domains = req.sys.num_domains; - } else { - hexagon_err = AEE_EUNSUPPORTED; - goto bail; } + *domains_info = req.sys.domains; + *num_domains = req.sys.num_domains; - bail: +bail: if (hexagon_err && !req.sys.domains) { free(req.sys.domains); } @@ -4358,7 +4350,7 @@ static int ggmlhexagon_get_dsp_support(int * domain) { GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); } - bail: +bail: return hexagon_error; } @@ -4405,7 +4397,7 @@ static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capab GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); } - bail: +bail: return hexagon_error; } @@ -4475,7 +4467,7 @@ static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); } - bail: +bail: return false; } @@ -4505,7 +4497,7 @@ static void ggmlhexagon_set_rpc_latency(int domain, int qos, int latency) { 
GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); } - bail: +bail: return; } @@ -4539,7 +4531,7 @@ static bool ggmlhexagon_is_status_notification_supported(int domain) { GGMLHEXAGON_LOG_WARN("remote_dsp_capability interface is not supported on this device"); } - bail: +bail: return false; } @@ -4585,7 +4577,7 @@ static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); } - bail: +bail: return hexagon_error; } @@ -4616,7 +4608,7 @@ static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) { GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); } - bail: +bail: return hexagon_error; } @@ -4667,7 +4659,7 @@ static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t GGMLHEXAGON_LOG_DEBUG("remote_dsp_capability interface is not supported on this device"); } - bail: +bail: return hexagon_error; } @@ -4764,7 +4756,6 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { int use_logical_id = 0; int core_id = -1; fastrpc_domain * domains_info = NULL; - fastrpc_domain * domain_info = NULL; int num_domains = -1; domain * my_domain = NULL; @@ -4782,7 +4773,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, ctx->rpc_mempool_len); if (nullptr == ctx->rpc_mempool) { hexagon_error = AEE_ENORPCMEMORY; - printf("rpc memory alloc failed", hexagon_error); + GGMLHEXAGON_LOG_WARN("rpc memory alloc failed %d", hexagon_error); ctx->rpc_mempool_len = 0; return 2; } @@ -4908,7 +4899,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { rpcmem_free(ctx->rpc_mempool); ctx->rpc_mempool = nullptr; ctx->rpc_mempool_len = 0; - ctx->ggmlop_handle = -1; + ctx->ggmlop_handle = 0; ctx->domain_id = -1; } @@ -4918,12 +4909,12 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { static void ggmlhexagon_close_cdsp(ggml_backend_hexagon_context * ctx) { int hexagon_error = AEE_SUCCESS; GGMLHEXAGON_LOG_INFO("enter %s", __func__); - if (-1 != ctx->ggmlop_handle) { + if (0 != ctx->ggmlop_handle) { hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); if (AEE_SUCCESS != hexagon_error) { GGMLHEXAGON_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error); } else { - ctx->ggmlop_handle = -1; + ctx->ggmlop_handle = 0; } } @@ -5031,16 +5022,8 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const const struct ggml_tensor * src0 = op_tensor->src[0]; const struct ggml_tensor * src1 = op_tensor->src[1]; - int64_t ne00 = 0; - uint32_t src0_rank = 0; - uint32_t src1_rank = 0; - if (nullptr != src0) { - src0_rank = ggml_n_dims(src0); - ne00 = src0->ne[0]; - } - if (nullptr != src1) { - src1_rank = ggml_n_dims(src1); - } + const int64_t ne00 = src0->ne[0]; + const uint32_t src0_rank = ggml_n_dims(src0); switch (op_tensor->op) { case GGML_OP_ADD: @@ -5689,7 +5672,7 @@ bool ggml_backend_is_hexagon(ggml_backend_t backend) { return backend != nullptr && ggml_guid_matches(backend->guid, ggml_backend_hexagon_guid()); } -void ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_threads) { +static void ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_threads) { GGML_ASSERT(ggml_backend_is_hexagon(backend)); struct ggml_backend_hexagon_context * ctx = (struct ggml_backend_hexagon_context 
*)backend->context; @@ -5858,8 +5841,6 @@ static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_ * @return */ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_path) { - int result = 0; - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); //case-3: calling ggml_backend_hexagon_init() directly in user's code ggmlhexagon_load_cfg(); diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c index 45f6a9b86e426..ce5f0ae383fb2 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c @@ -403,11 +403,9 @@ static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintp _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[11]), (char**)&(((uint64_t*)_in1)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[12]), (uint32_t*)&(((uint32_t*)_in1)[14]))); _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14]))); if(_numIn[0]>=255){ - _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of input buffers\n"); return AEE_EUNSUPPORTED; } if(_numROut[0]>=255){ - _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of output buffers\n"); return AEE_EUNSUPPORTED; } _allocator_init(_al, 0, 0); diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h index 660bab4a15e70..e30f833f06d17 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h @@ -180,14 +180,7 @@ static __inline void _qaic_memmove(void* dst, void* src, int size) { #ifdef _WIN32 #define _QAIC_FARF(level, msg, ...) (void)0 #else -#define _QAIC_FARF(level, msg, ...) \ - do {\ - if(0 == (HAP_debug_v2) ) {\ - (void)0; \ - } else { \ - FARF(level, msg , ##__VA_ARGS__); \ - } \ - }while(0) +#define _QAIC_FARF(level, msg, ...) (void)0 #endif //_WIN32 for _QAIC_FARF #define _TRY(ee, func) \ diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c index a526d44e2260c..355852b394dfc 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c @@ -894,13 +894,13 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign); } -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { +static inline float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); return ggml_table_f32_f16[s]; } -static void ggml_init() { +static inline void ggml_init(void) { for (int i = 0; i < (1 << 16); ++i) { union { uint16_t u16; @@ -1609,7 +1609,6 @@ static void ggml_compute_forward_div_f32( } int ggmlop_dsp_div(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - switch (src0->type) { case GGML_TYPE_F32: { @@ -1621,6 +1620,8 @@ int ggmlop_dsp_div(remote_handle64 h, const ggml_tensor * src0, const ggml_tenso GGML_ABORT("fatal error"); } } + + return 0; } static void ggml_compute_forward_mul_mat_one_chunk( From 57d3322b40460e154d188380b83ad47e29924016 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 31 Mar 2025 10:47:34 +0800 Subject: [PATCH 148/200] ggml-hexagon: enable only one backend device for HWACCEL_CDSP and enable rpc ion memory pool for HWACCEL_CDSP --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 342 ++++++++++++++++--------- scripts/ggml-hexagon.cfg | 2 + 2 files changed, 220 insertions(+), 124 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c460f0de66db8..525d8d928a96e 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -151,6 +151,7 @@ struct ggml_backend_hexagon_context; #define QNN_VER_PTR(x) (&((x).v1)) #define RPCMEM_DEFAULT_FLAGS 1 #define RPCMEM_HEAP_ID_SYSTEM 25 +#define SIZE_IN_MB (1 << 20) #define STATUS_CONTEXT 0x12345678 #define CHECK_QNN_API(error, result) \ @@ -277,32 +278,15 @@ struct ggml_backend_hexagon_context { int n_threads; //Hexagon resource management for the general approach through Hexagaon cDSP + size_t rpc_mempool_capacity; size_t rpc_mempool_len; + size_t rpc_mempool_usage; void * rpc_mempool; + int rpc_mempool_handle; remote_handle64 ggmlop_handle; int domain_id; }; -struct ggml_backend_hexagon_buffer_context { - ~ggml_backend_hexagon_buffer_context() { - if (buffer) { - ggml_aligned_free(buffer, 0); - } - - for (auto * sub_buffer : sub_buffers) { - free(sub_buffer); - } - - sub_buffers.clear(); - } - void * buffer = nullptr; - - struct ggml_backend_hexagon_context * backend_ctx = nullptr; - - size_t buffer_size = 0; - std::vector sub_buffers; -}; - struct qnn_op_caps { bool supported; ggml_op op; @@ -331,6 +315,7 @@ struct hexagon_appcfg_t { int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP int enable_mulmat_cdsp; // enable/disable offload mulmat to cDSP int enable_q_mulmat; // enable/disable offload fp32 & quantized mulmat to cDSP + int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool const char * cfgfilename; const char * runtimelib_path; }; @@ -348,6 +333,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .hexagon_backend = HEXAGON_BACKEND_CDSP, .enable_mulmat_cdsp = 0, .enable_q_mulmat = 0, + .enable_rpc_ion_mempool = 0, .cfgfilename = "ggml-hexagon.cfg", #if defined(__ANDROID__) //Android command line program @@ -873,8 +859,9 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); ggmlhexagon_get_timestring(timestamp); if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - GGMLHEXAGON_LOG_INFO("only offload GGML_OP_ADD: %s", g_hexagon_appcfg.enable_mulmat_cdsp ? 
"NO" : "YES"); + GGMLHEXAGON_LOG_INFO("offload GGML_OP_MULMAT: %s", g_hexagon_appcfg.enable_mulmat_cdsp ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); + GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); } else { GGMLHEXAGON_LOG_INFO("only offload GGML_OP_ADD: NO"); } @@ -1451,17 +1438,18 @@ static void ggmlhexagon_load_cfg() { }); std::string precision_mode; qnncfg_instance.get_intvalue("general", "print_qnn_internal_log", g_hexagon_appcfg.print_qnn_internal_log, 0); - qnncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 0); + qnncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1); qnncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0); qnncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0); - qnncfg_instance.get_intvalue("general", "hwaccel_approach", g_hexagon_appcfg.hwaccel_approach, 0); - qnncfg_instance.get_intvalue("general", "hexagon_backend", g_hexagon_appcfg.hexagon_backend, 2); + qnncfg_instance.get_intvalue("general", "hwaccel_approach", g_hexagon_appcfg.hwaccel_approach, HWACCEL_CDSP); + qnncfg_instance.get_intvalue("general", "hexagon_backend", g_hexagon_appcfg.hexagon_backend, HEXAGON_BACKEND_CDSP); qnncfg_instance.get_intvalue("qnn", "hvx_threads", g_hexagon_appcfg.hvx_threads, 4); qnncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_hexagon_appcfg.vtcm_size_in_mb, 8); - qnncfg_instance.get_intvalue("qnn", "enable_dlbc", g_hexagon_appcfg.enable_dlbc, 0); + qnncfg_instance.get_intvalue("qnn", "enable_dlbc", g_hexagon_appcfg.enable_dlbc, 1); qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); - qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_hexagon_appcfg.enable_mulmat_cdsp, 0); + qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_hexagon_appcfg.enable_mulmat_cdsp, 1); qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0); + qnncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 1); GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, @@ -3170,7 +3158,6 @@ int qnn_instance::htp_init_perfinfra() { void qnn_instance::htp_probe_rpc_meminfo() { size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { @@ -4479,7 +4466,7 @@ static void ggmlhexagon_set_rpc_latency(int domain, int qos, int latency) { /* qos | latency ----------------------- - RPC_PM_QOS | 300 + RPC_PM_QOS | 100 RPC_POLL_QOS | 1000 */ data.enable = qos; @@ -4685,11 +4672,10 @@ static int ggmlhexagon_request_status_notifications(int domain_id, void * contex return hexagon_error; } -static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx, size_t * rpcmem_capacity) { +static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; - const int SIZE_IN_MB = (1 << 20); - size_t probe_slots[] = {1024, 1536, 2048 - 48, 2048}; + 
size_t probe_slots[] = {1024, 1536, 2000, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); @@ -4702,10 +4688,52 @@ static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx, size_t rpc_buffer = nullptr; } } + ctx->rpc_mempool_capacity = candidate_size * SIZE_IN_MB; + GGMLHEXAGON_LOG_DEBUG("rpc memory capacity %ld(%d M) for device %d", + ctx->rpc_mempool_capacity, ctx->rpc_mempool_capacity / SIZE_IN_MB, ctx->device); + GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MB", ctx->rpc_mempool_capacity / SIZE_IN_MB); + + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + //FIXME: reasonable rpc memory pool size + ctx->rpc_mempool_len = 1024 * SIZE_IN_MB; + if (ctx->rpc_mempool_len > ctx->rpc_mempool_capacity) { + GGMLHEXAGON_LOG_WARN("rpc mempool is too big"); + return; + } + //FIXME: use ion memory pool currently, it seems there is unknown bug with DMA memory pool + ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, + ctx->rpc_mempool_len); + if (nullptr == ctx->rpc_mempool) { + GGMLHEXAGON_LOG_WARN("alloc rpc memorypool %d failed", ctx->rpc_mempool_len); + return; + } else { + GGMLHEXAGON_LOG_DEBUG("alloc rpc memorypool %p successfully %ld(%d M)", + ctx->rpc_mempool, ctx->rpc_mempool_len, + ctx->rpc_mempool_len / SIZE_IN_MB); + } + ctx->rpc_mempool_handle = rpcmem_to_fd(ctx->rpc_mempool); + GGMLHEXAGON_LOG_DEBUG("rpc mempool handle %d", ctx->rpc_mempool_handle); + remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_handle); + } + + return; +} - *rpcmem_capacity = candidate_size; - GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MB", *rpcmem_capacity); +static void ggmlhexagon_deinit_rpcmempool(ggml_backend_hexagon_context * ctx) { + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + if (ctx->rpc_mempool) { + //deregister rpc memory pool + remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, -1); + GGMLHEXAGON_LOG_DEBUG("free rpc mempool %p", ctx->rpc_mempool); + rpcmem_free(ctx->rpc_mempool); + ctx->rpc_mempool = nullptr; + ctx->rpc_mempool_len = 0; + ctx->rpc_mempool_capacity = 0; + } + } +} +static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx) { uint32_t dsp_version = 0; ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version); @@ -4727,22 +4755,40 @@ static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx, size_t uint32_t vtcm_page = 0; ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_COUNT, &vtcm_count); ggmlhexagon_get_vtcm_info(ctx->domain_id, VTCM_PAGE, &vtcm_page); - GGMLHEXAGON_LOG_DEBUG("vtcm_count %d", vtcm_count); - GGMLHEXAGON_LOG_DEBUG("vtcm_page %d", vtcm_page); + GGMLHEXAGON_LOG_INFO("vtcm_count %d", vtcm_count); + GGMLHEXAGON_LOG_INFO("vtcm_page %d", vtcm_page); uint32_t hmx_depth = 0; uint32_t hmx_spatial = 0; ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_DEPTH, &hmx_depth); ggmlhexagon_get_hmx_support_info(ctx->domain_id, HMX_SUPPORT_SPATIAL, &hmx_spatial); - GGMLHEXAGON_LOG_DEBUG("hmx_depth %d", hmx_depth); - GGMLHEXAGON_LOG_DEBUG("hmx_spatial %d", hmx_spatial); + GGMLHEXAGON_LOG_INFO("hmx_depth %d", hmx_depth); + GGMLHEXAGON_LOG_INFO("hmx_spatial %d", hmx_spatial); uint32_t hvx_support_128b = 0; ggmlhexagon_get_hvx_support_info(ctx->domain_id, 
HVX_SUPPORT_128B, &hvx_support_128b); - GGMLHEXAGON_LOG_DEBUG("hvx_support_128b %d", hvx_support_128b); + GGMLHEXAGON_LOG_INFO("hvx_support_128b %d", hvx_support_128b); + + GGMLHEXAGON_LOG_INFO("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); + GGMLHEXAGON_LOG_INFO("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); +} + +static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) { + int hexagon_error = AEE_SUCCESS; + GGMLHEXAGON_LOG_INFO("enter %s", __func__); + if (0 != ctx->ggmlop_handle) { + hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); + if (AEE_SUCCESS != hexagon_error) { + GGMLHEXAGON_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error); + } else { + ctx->ggmlop_handle = 0; + } + } - GGMLHEXAGON_LOG_DEBUG("unsigned pd supported %d", ggmlhexagon_get_unsignedpd_support()); - GGMLHEXAGON_LOG_DEBUG("async fastrpc supported %d", ggmlhexagon_is_async_fastrpc_supported(ctx->domain_id)); + ggmlhexagon_deinit_rpcmempool(ctx); + + ctx->domain_id = -1; + GGMLHEXAGON_LOG_INFO("leave %s", __func__); } static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { @@ -4767,16 +4813,11 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { if (nullptr == ctx) return 1; GGMLHEXAGON_LOG_INFO("init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); - //TODO: reasonable rpc memory pool size and use it practically - ctx->ggmlop_handle = -1; - ctx->rpc_mempool_len = (1 << 20) * 512; - ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, ctx->rpc_mempool_len); - if (nullptr == ctx->rpc_mempool) { - hexagon_error = AEE_ENORPCMEMORY; - GGMLHEXAGON_LOG_WARN("rpc memory alloc failed %d", hexagon_error); - ctx->rpc_mempool_len = 0; - return 2; + if (nullptr != ctx->rpc_mempool) { + GGMLHEXAGON_LOG_INFO("already init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + return 0; } + ctx->ggmlop_handle = -1; if (-1 == domain_id) { if (nullptr != domain_type) { @@ -4877,17 +4918,20 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); if (AEE_SUCCESS == hexagon_error) { GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); - GGMLHEXAGON_LOG_INFO("only support GGML_OP_ADD and GGML_OP_MUL_MAT on cDSP currently"); - ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_V2_DUTY_CYCLE_MODE, 40, 1); - size_t rpcmem_size = 0; - ggmlhexagon_probe_dspinfo(ctx, &rpcmem_size); - ggmlhexagon_set_rpc_latency(domain_id, RPC_POLL_QOS, 1000); + GGMLHEXAGON_LOG_INFO("only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP currently"); + ggmlhexagon_probe_dspinfo(ctx); + ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1); + ggmlhexagon_set_rpc_latency(domain_id, RPC_PM_QOS, 100); + ggmlhexagon_init_rpcmempool(ctx); } else { GGMLHEXAGON_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, ggmlhexagon_get_dsp_name(domain_id)); goto bail; } + //ensure test-backend-ops get the correct backend name when hwaccel approach is 2(HWACCEL_CDSP) + memcpy(g_hexagon_mgr[ctx->device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); + return 0; bail: @@ -4895,38 +4939,11 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { free(ggmlop_domain_uri); } - if (ctx->rpc_mempool) { - rpcmem_free(ctx->rpc_mempool); - 
ctx->rpc_mempool = nullptr; - ctx->rpc_mempool_len = 0; - ctx->ggmlop_handle = 0; - ctx->domain_id = -1; - } + ggmlhexagon_deinit_cdsp(ctx); return -1; } -static void ggmlhexagon_close_cdsp(ggml_backend_hexagon_context * ctx) { - int hexagon_error = AEE_SUCCESS; - GGMLHEXAGON_LOG_INFO("enter %s", __func__); - if (0 != ctx->ggmlop_handle) { - hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); - if (AEE_SUCCESS != hexagon_error) { - GGMLHEXAGON_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error); - } else { - ctx->ggmlop_handle = 0; - } - } - - if (ctx->rpc_mempool) { - rpcmem_free(ctx->rpc_mempool); - ctx->rpc_mempool = nullptr; - ctx->rpc_mempool_len = 0; - ctx->domain_id = -1; - } - GGMLHEXAGON_LOG_INFO("leave %s", __func__); -} - static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_tensor * op) { //skip sanity check because already checked in other place struct dsptensor dsptensor_0; @@ -5024,7 +5041,7 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const const struct ggml_tensor * src1 = op_tensor->src[1]; const int64_t ne00 = src0->ne[0]; const uint32_t src0_rank = ggml_n_dims(src0); - + const uint32_t src1_rank = ggml_n_dims(src1); switch (op_tensor->op) { case GGML_OP_ADD: case GGML_OP_SUB: @@ -5033,22 +5050,12 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const return false; } - //FIXME:remove this filter - if (ne00 < 32) - return false; - - //FIXME:remove this filter + //TODO: offload quantize GGML_OP_ADD to cDSP return ggmlhexagon_same_types(ctx, op_tensor); } case GGML_OP_MUL_MAT: { ggmlhexagon_dump_op_info(op_tensor); - - //TODO:3d&4d matrix mulmat on cDSP - if (src0_rank != 2) - return false; - - ggmlhexagon_dump_op_info(op_tensor); if (g_hexagon_appcfg.enable_q_mulmat) return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q6_K ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); @@ -5282,6 +5289,32 @@ static bool ggmlhexagon_compute_forward(ggml_backend_t backend, struct ggml_tens return true; } +struct ggml_backend_hexagon_buffer_context { + ~ggml_backend_hexagon_buffer_context() { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); + if (buffer) { + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + GGMLHEXAGON_LOG_DEBUG("rpcmem %p, size %d", buffer, buffer_size); + //do nonthing here because rpc mempool was used for HWACCEL_CDSP + } else { + ggml_aligned_free(buffer, 0); + } + } + + for (auto * sub_buffer : sub_buffers) { + free(sub_buffer); + } + + sub_buffers.clear(); + } + void * buffer = nullptr; + + struct ggml_backend_hexagon_context * backend_ctx = nullptr; + + size_t buffer_size = 0; + std::vector sub_buffers; +}; + static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_backend_hexagon_buffer_context * ctx = (ggml_backend_hexagon_buffer_context *)buffer->context; delete ctx; @@ -5357,7 +5390,12 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_hexagon_buffer_context * ctx = new ggml_backend_hexagon_buffer_context; + GGMLHEXAGON_LOG_DEBUG("enter %s, size %d", __func__, size); + struct ggml_backend_hexagon_context * ctx = static_cast(buft->context); + GGML_ASSERT(nullptr != ctx); + GGMLHEXAGON_LOG_DEBUG("device %d(%s)", ctx->device, 
ggml_backend_hexagon_get_devname(ctx->device)); + + ggml_backend_hexagon_buffer_context * buffer_ctx = new ggml_backend_hexagon_buffer_context; size_t size_page = 0; #if defined(__ANDROID__) || defined(__linux__) @@ -5368,17 +5406,31 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( size_page = systeminfo.dwPageSize; #endif size_t size_aligned = size; + GGMLHEXAGON_LOG_DEBUG("size_aligned %d", size_aligned); if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); } - ctx->buffer = ggml_aligned_malloc(size_aligned); - ctx->buffer_size = size_aligned; - if (nullptr == ctx->buffer) { + GGMLHEXAGON_LOG_DEBUG("size_aligned %d", size_aligned); + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + GGMLHEXAGON_LOG_DEBUG("rpc mempool len %d", ctx->rpc_mempool_len); + GGMLHEXAGON_LOG_DEBUG("rpc mempool usage %d", ctx->rpc_mempool_usage); + GGML_ASSERT(ctx->rpc_mempool_usage <= ctx->rpc_mempool_len); + buffer_ctx->buffer = (static_cast(ctx->rpc_mempool)) + ctx->rpc_mempool_usage; + GGMLHEXAGON_LOG_DEBUG("buffer_ctx->buffer %p", buffer_ctx->buffer); + GGML_ASSERT(nullptr != buffer_ctx->buffer); + ctx->rpc_mempool_usage += size_aligned; + } else { + buffer_ctx->buffer = ggml_aligned_malloc(size_aligned); + } + buffer_ctx->buffer_size = size_aligned; + if (nullptr == buffer_ctx->buffer) { GGMLHEXAGON_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20)); return nullptr; + } else { + GGMLHEXAGON_LOG_DEBUG("%s: succeed to allocate %d MiB\n", __func__, size / (1 << 20)); } - return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, buffer_ctx, size); } static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -5429,7 +5481,7 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) { if (g_hexagon_mgr[ctx->device].backend != nullptr) { if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - ggmlhexagon_close_cdsp(ctx); + ggmlhexagon_deinit_cdsp(ctx); } delete backend; @@ -5461,7 +5513,7 @@ static enum ggml_status ggmlhexagon_backend_graph_compute_general(ggml_backend_t } static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) { - struct ggml_backend_hexagon_context *ctx = static_cast(dev->context); + struct ggml_backend_hexagon_context * ctx = static_cast(dev->context); if (nullptr == ctx) { GGMLHEXAGON_LOG_ERROR("pls check why ctx is null"); return "unknown"; @@ -5470,6 +5522,7 @@ static const char * ggml_backend_hexagon_device_get_name(ggml_backend_dev_t dev) } static const char * ggml_backend_hexagon_device_get_description(ggml_backend_dev_t dev) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); struct ggml_backend_hexagon_context * ctx = static_cast(dev->context); static char hexagon_device_desc[GGMLHEXAGON_TMPBUF_LEN]; if (nullptr == ctx) { @@ -5510,15 +5563,21 @@ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_ size_t rpc_ion_memsize = 0; size_t rpc_ion_usage = 0; if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) { + //TODO: uniform rpc_ion_memsize and rpc_ion_usage between HWACCEL_CDSP and HWACCEL_QNN rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); rpc_ion_usage = ctx->instance->get_rpcmem_usage(); + *total = rpc_ion_memsize * SIZE_IN_MB; + *free = (rpc_ion_memsize - rpc_ion_usage) * SIZE_IN_MB; + GGMLHEXAGON_LOG_DEBUG("rpc 
memsize %d M", rpc_ion_memsize); + GGMLHEXAGON_LOG_DEBUG("rpc usage %d M", rpc_ion_usage); } else { - ggmlhexagon_probe_dspinfo(ctx, &rpc_ion_memsize); + rpc_ion_memsize = ctx->rpc_mempool_capacity; + rpc_ion_usage = ctx->rpc_mempool_usage; + *total = rpc_ion_memsize; + *free = (rpc_ion_memsize - rpc_ion_usage); + GGMLHEXAGON_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize / SIZE_IN_MB); + GGMLHEXAGON_LOG_DEBUG("rpc usage %d M", rpc_ion_usage / SIZE_IN_MB); } - GGMLHEXAGON_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize); - GGMLHEXAGON_LOG_DEBUG("rpc usage %d M", rpc_ion_usage); - *total = rpc_ion_memsize * (1 << 20); - *free = (rpc_ion_memsize - rpc_ion_usage) * (1 << 20); } } @@ -5536,6 +5595,7 @@ static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_back static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { + GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); props->name = ggml_backend_hexagon_device_get_name(dev); props->description = ggml_backend_hexagon_device_get_description(dev); props->type = ggml_backend_hexagon_device_get_type(dev); @@ -5591,7 +5651,7 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device return nullptr; } - static struct ggml_backend_buffer_type ggml_backend_buffer_type_qnn = { + static struct ggml_backend_buffer_type ggml_backend_buffer_type_hexagon = { /* .iface = */ { /* .get_name = */ ggml_backend_hexagon_buffer_type_name, /* .alloc_buffer = */ ggml_backend_hexagon_buffer_type_alloc_buffer, @@ -5601,14 +5661,22 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device /* .is_host = */ ggml_backend_hexagon_buffer_is_host }, /* .device = */ nullptr, - /* .context = */ nullptr, + /* .context = */ &g_hexagon_mgr[device_index], }; - return &ggml_backend_buffer_type_qnn; + if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + //here is the trick: + //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP + //and we need to re-use the g_hexagon_mgr + //so context is g_hexagon_mgr[HEXAGON_BACKEND_CDSP] rather than g_hexagon_mgr[0] + ggml_backend_buffer_type_hexagon.context = &g_hexagon_mgr[HEXAGON_BACKEND_CDSP]; + } + + return &ggml_backend_buffer_type_hexagon; } static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *) dev->context; + ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context; return ggml_backend_hexagon_buffer_type(ctx->device); } @@ -5680,7 +5748,12 @@ static void ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_thr } int ggml_backend_hexagon_get_device_count() { - return GGML_HEXAGON_MAX_DEVICES; + if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); + return 1; + } else { + return GGML_HEXAGON_MAX_DEVICES; + } } struct ggml_backend_hexagon_reg_context { @@ -5694,7 +5767,12 @@ static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) { static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) { GGML_UNUSED(reg); - return GGML_HEXAGON_MAX_DEVICES; + if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); + return 1; + } else { + return GGML_HEXAGON_MAX_DEVICES; + } } static ggml_backend_dev_t 
ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t reg, size_t index) { @@ -5703,8 +5781,13 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t GGMLHEXAGON_LOG_DEBUG("index %d", index); ggml_backend_hexagon_reg_context * ctx = (ggml_backend_hexagon_reg_context *)reg->context; - GGML_ASSERT(index < ctx->devices.size()); - return ctx->devices[index]; + if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); + return ctx->devices[0]; + } else { + GGML_ASSERT(index < ctx->devices.size()); + return ctx->devices[index]; + } } static void * ggml_backend_hexagon_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) { @@ -5760,12 +5843,29 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { } else { ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_qnn; } - ggml_backend_dev_t dev = new ggml_backend_device { + GGMLHEXAGON_LOG_DEBUG("create backend device for device %d", i); + ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_hexagon_device_interface, /* .reg = */ ®, /* .context = */ &g_hexagon_mgr[i] }; + if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + //here is the trick: + //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP + //and we need to re-use the g_hexagon_mgr + //so context is g_hexagon_mgr[HEXAGON_BACKEND_CDSP] rather than g_hexagon_mgr[0] + dev->context = &g_hexagon_mgr[HEXAGON_BACKEND_CDSP]; + } ctx->devices.push_back(dev); + + if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); + int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]); + if (0 != result) { + GGMLHEXAGON_LOG_INFO("init hexagon dsp failure"); + } + GGML_ASSERT(0 == result); + } } reg = ggml_backend_reg { @@ -5783,13 +5883,13 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { } const char * ggml_backend_hexagon_get_devname(size_t dev_num) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - if (dev_num == HEXAGON_BACKEND_GGML) - return "ggml"; - else + if (HEXAGON_BACKEND_CDSP == dev_num) return "HEXAGON_BACKEND_CDSP"; } + //fallback switch (dev_num) { case HEXAGON_BACKEND_QNNCPU: return "HEXAGON_BACKEND_QNN_CPU"; @@ -5860,9 +5960,7 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_pat } std::string path = qnn_lib_path; - GGMLHEXAGON_LOG_DEBUG("lib_path %s", path.c_str()); ggmlhexagon_set_runtime_path(device, path); - if (nullptr != g_hexagon_mgr[device].backend) { GGMLHEXAGON_LOG_DEBUG("backend %d(%s) already loaded", device, ggml_backend_hexagon_get_devname(device)); @@ -5876,9 +5974,7 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_pat if (nullptr == instance) return nullptr; } - ggml_backend_hexagon_interface.graph_compute = ggmlhexagon_backend_graph_compute_general; - ggml_backend_t hexagon_backend = new ggml_backend{ /* .guid = */ ggml_backend_hexagon_guid(), /* .iface = */ ggml_backend_hexagon_interface, @@ -5894,8 +5990,6 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_pat ggml_backend_hexagon_free(hexagon_backend); return nullptr; } - //ensure test-backend-ops get the correct backend name when hwaccel approach is 2(HWACCEL_CDSP) - memcpy(g_hexagon_mgr[device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP")); } else { //got fully description of SoC when 
hwaccel approach is HWACCEL_QNN and backend is HEXAGON_BACKEND_QNNNPU GGMLHEXAGON_LOG_INFO("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device)); diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index ce8d57938f72d..e12c2da7447bd 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -35,3 +35,5 @@ precision_mode = "fp16" enable_mulmat_cdsp = 1 #enable/disable offload fp32 & quantized type mulmat to cDSP enable_q_mulmat = 0 +#enable/disable rpc ion memory pool +enable_rpc_ion_mempool = 1 From 77755899b2b71315b37992b38ece297ca8c82eb9 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 31 Mar 2025 20:27:14 +0800 Subject: [PATCH 149/200] ggml-hexagon: rpc ion memory pool and test-backend-ops work fine in HWACCEL_CDSP approach for the first time --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 16 ++++++++++++---- scripts/ggml-hexagon.cfg | 2 ++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 525d8d928a96e..40a93824a704c 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -129,6 +129,8 @@ class qnn_instance; struct ggml_backend_hexagon_context; +static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx); + #if 0//def NDEBUG #define GGMLHEXAGON_DEBUG 0 #else @@ -241,7 +243,7 @@ enum qcom_chipset_soc_model { SM8475 = 42, // v69, SD 8+ Gen 1 SM8550 = 43, // v73, SD 8 Gen 2 SM8650 = 57, // v75, SD 8 Gen 3 - SM8750 = 69, // v79, SD 8 Gen 4 + SM8750 = 69, // v79, SD 8 Elite(aka 8 Gen 4) #if !defined(__ANDROID__) && !defined(__linux__) SC7280X = 44, SC8280X = 37, @@ -316,6 +318,7 @@ struct hexagon_appcfg_t { int enable_mulmat_cdsp; // enable/disable offload mulmat to cDSP int enable_q_mulmat; // enable/disable offload fp32 & quantized mulmat to cDSP int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool + int enable_rpc_dma_mempool; // enable/disable rpc dma memory pool const char * cfgfilename; const char * runtimelib_path; }; @@ -334,6 +337,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .enable_mulmat_cdsp = 0, .enable_q_mulmat = 0, .enable_rpc_ion_mempool = 0, + .enable_rpc_dma_mempool = 0, .cfgfilename = "ggml-hexagon.cfg", #if defined(__ANDROID__) //Android command line program @@ -394,7 +398,7 @@ static struct qcom_socinfo g_qnn_soc_info_table[] = { .soc_model = SM8750, .htp_arch = V79, .vtcm_size_in_mb = 8, - .soc_desc = "Qualcomm SnapDragon 8 Gen 4"}, + .soc_desc = "Qualcomm SnapDragon 8 Elite(aka 8 Gen 4)"}, #if !defined(__ANDROID__) && !defined(__linux__) /* Qualcomm SnapDragon 7c Gen 2 */ @@ -862,6 +866,8 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c GGMLHEXAGON_LOG_INFO("offload GGML_OP_MULMAT: %s", g_hexagon_appcfg.enable_mulmat_cdsp ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); + GGMLHEXAGON_LOG_INFO("using rpc dma memory pool: %s", g_hexagon_appcfg.enable_rpc_dma_mempool ? 
"YES" : "NO"); + ggmlhexagon_probe_dspinfo(ctx); } else { GGMLHEXAGON_LOG_INFO("only offload GGML_OP_ADD: NO"); } @@ -1450,6 +1456,7 @@ static void ggmlhexagon_load_cfg() { qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_hexagon_appcfg.enable_mulmat_cdsp, 1); qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0); qnncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 1); + qnncfg_instance.get_intvalue("cdsp", "enable_rpc_dma_mempool", g_hexagon_appcfg.enable_rpc_dma_mempool, 0); GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, @@ -4814,7 +4821,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { return 1; GGMLHEXAGON_LOG_INFO("init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); if (nullptr != ctx->rpc_mempool) { - GGMLHEXAGON_LOG_INFO("already init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + GGMLHEXAGON_LOG_DEBUG("already init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); return 0; } ctx->ggmlop_handle = -1; @@ -5480,13 +5487,14 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) { } if (g_hexagon_mgr[ctx->device].backend != nullptr) { + //print timestamp and dsp information before deinit cdsp, useful for troubleshooting + ggmlhexagon_print_running_timestamp(ctx); if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { ggmlhexagon_deinit_cdsp(ctx); } delete backend; g_hexagon_mgr[ctx->device].backend = nullptr; - ggmlhexagon_print_running_timestamp(ctx); } GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); } diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index e12c2da7447bd..0171957792010 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -37,3 +37,5 @@ enable_mulmat_cdsp = 1 enable_q_mulmat = 0 #enable/disable rpc ion memory pool enable_rpc_ion_mempool = 1 +#enable/disable rpc dma memory pool +enable_rpc_dma_mempool = 0 From 2896ffcac588118a2d239ef0211dc02102c4af02 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 31 Mar 2025 21:09:04 +0800 Subject: [PATCH 150/200] ggml-hexagon: make comprision of mulmat performance between HWACCEL_QNN and HWACCEL_CDSP easily --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 45 +++++++++++--------------- scripts/ggml-hexagon.cfg | 17 ++++++---- 2 files changed, 28 insertions(+), 34 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 40a93824a704c..cc48137b2f646 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -309,14 +309,13 @@ struct hexagon_appcfg_t { int enable_perf; // enable/disable perf of op function int print_tensors_info; // enable/disable print tensors info in op function int dump_op_info; // enable/disable dump op info in handle_op + int enable_q_mulmat; // enable/disable offload quantized mulmat int precision_mode; // 0: default 1:fp16 int hvx_threads; int vtcm_size_in_mb; int enable_dlbc; int hwaccel_approach; // 0: HWACCEL_QNN 1: HWACCEL_QNN_SINGLEGRAPH 2: HWACCEL_CDSP int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP - int enable_mulmat_cdsp; // enable/disable 
offload mulmat to cDSP - int enable_q_mulmat; // enable/disable offload fp32 & quantized mulmat to cDSP int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool int enable_rpc_dma_mempool; // enable/disable rpc dma memory pool const char * cfgfilename; @@ -328,14 +327,13 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .enable_perf = 0, .print_tensors_info = 0, .dump_op_info = 0, + .enable_q_mulmat = 0, .precision_mode = 0, .hvx_threads = 4, .vtcm_size_in_mb = 8, .enable_dlbc = 1, .hwaccel_approach = HWACCEL_CDSP, .hexagon_backend = HEXAGON_BACKEND_CDSP, - .enable_mulmat_cdsp = 0, - .enable_q_mulmat = 0, .enable_rpc_ion_mempool = 0, .enable_rpc_dma_mempool = 0, .cfgfilename = "ggml-hexagon.cfg", @@ -863,13 +861,12 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); ggmlhexagon_get_timestring(timestamp); if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - GGMLHEXAGON_LOG_INFO("offload GGML_OP_MULMAT: %s", g_hexagon_appcfg.enable_mulmat_cdsp ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("using rpc dma memory pool: %s", g_hexagon_appcfg.enable_rpc_dma_mempool ? "YES" : "NO"); ggmlhexagon_probe_dspinfo(ctx); } else { - GGMLHEXAGON_LOG_INFO("only offload GGML_OP_ADD: NO"); + GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); } GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); } @@ -1449,12 +1446,11 @@ static void ggmlhexagon_load_cfg() { qnncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0); qnncfg_instance.get_intvalue("general", "hwaccel_approach", g_hexagon_appcfg.hwaccel_approach, HWACCEL_CDSP); qnncfg_instance.get_intvalue("general", "hexagon_backend", g_hexagon_appcfg.hexagon_backend, HEXAGON_BACKEND_CDSP); + qnncfg_instance.get_intvalue("general", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0); qnncfg_instance.get_intvalue("qnn", "hvx_threads", g_hexagon_appcfg.hvx_threads, 4); qnncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_hexagon_appcfg.vtcm_size_in_mb, 8); qnncfg_instance.get_intvalue("qnn", "enable_dlbc", g_hexagon_appcfg.enable_dlbc, 1); qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); - qnncfg_instance.get_intvalue("cdsp", "enable_mulmat_cdsp", g_hexagon_appcfg.enable_mulmat_cdsp, 1); - qnncfg_instance.get_intvalue("cdsp", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0); qnncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 1); qnncfg_instance.get_intvalue("cdsp", "enable_rpc_dma_mempool", g_hexagon_appcfg.enable_rpc_dma_mempool, 0); GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, @@ -3017,7 +3013,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, HEXAGONBackend _graph_name = graph_name; _device_id = device; - GGMLHEXAGON_LOG_DEBUG("[%s][%s]created", ggml_backend_hexagon_get_devname(device), graph_name.c_str()); + //GGMLHEXAGON_LOG_DEBUG("[%s][%s]created", ggml_backend_hexagon_get_devname(device), graph_name.c_str()); Qnn_ErrorHandle_t error = QNN_SUCCESS; if (HEXAGON_BACKEND_QNNNPU == device) { @@ -3070,7 +3066,7 @@ int qnn_instance::init_qnn_graph(const std::string & graph_name, 
HEXAGONBackend } graph_configs.push_back(nullptr); error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), graph_configs.data(), &_qnn_graph_handle); - GGMLHEXAGON_LOG_DEBUG("[%s][%s]created graph %p", ggml_backend_hexagon_get_devname(device), graph_name.c_str(), _qnn_graph_handle); + //GGMLHEXAGON_LOG_DEBUG("[%s][%s]created graph %p", ggml_backend_hexagon_get_devname(device), graph_name.c_str(), _qnn_graph_handle); } else { error = _qnn_interface.qnn_graph_create(_qnn_context_handle, graph_name.c_str(), nullptr, &_qnn_graph_handle); } @@ -3280,7 +3276,7 @@ void qnn_instance::htp_set_n_hvx_threads(size_t n_threads) { if (QNN_SUCCESS != result) { GGMLHEXAGON_LOG_WARN("failed to set QNN graph config: set hvx threads %d", n_threads); } else { - GGMLHEXAGON_LOG_INFO("succeed to set QNN graph config: set hvx threads %d", n_threads); + //GGMLHEXAGON_LOG_DEBUG("succeed to set QNN graph config: set hvx threads %d", n_threads); } } @@ -3383,7 +3379,7 @@ static Qnn_OpConfig_t ggmlqnn_create_op_config(const char * name, const char * p } else { snprintf(opcfg_name, GGML_MAX_NAME, "opcfg_%s_%-8d", name, ggmlqnn_get_idx(QNN_OPCFG_INDEX)); } - GGMLHEXAGON_LOG_DEBUG("create qnn opconfig %s", opcfg_name); + //GGMLHEXAGON_LOG_DEBUG("create qnn opconfig %s", opcfg_name); ggmlqnn_inc_idx(QNN_OPCFG_INDEX); Qnn_OpConfigV1_t v1 = {opcfg_name, package, type, @@ -3564,7 +3560,7 @@ static void ggmlqnn_compute_elementwise(ggml_backend_hexagon_context * ctx, ggml } graph_handle = instance->get_qnn_graph_handle(); - GGMLHEXAGON_LOG_DEBUG("graph_handle %p", graph_handle); + //GGMLHEXAGON_LOG_DEBUG("graph_handle %p", graph_handle); //create computational tensor p_tensor0 = ggmlqnn_create_compute_tensor(instance, graph_handle, src0, QNN_TENSOR_TYPE_APP_WRITE); if (2 == input_param_count) { @@ -5063,7 +5059,7 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const case GGML_OP_MUL_MAT: { ggmlhexagon_dump_op_info(op_tensor); - if (g_hexagon_appcfg.enable_q_mulmat) + if (1 == g_hexagon_appcfg.enable_q_mulmat) return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q6_K ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); else @@ -5142,10 +5138,13 @@ static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const return false; if (ctx->device == HEXAGON_BACKEND_QNNNPU) { - return (src0->type == GGML_TYPE_F32 + if (1 == g_hexagon_appcfg.enable_q_mulmat) + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + else + return (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && op_tensor->type == GGML_TYPE_F32); } else { return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); @@ -5298,10 +5297,8 @@ static bool ggmlhexagon_compute_forward(ggml_backend_t backend, struct ggml_tens struct ggml_backend_hexagon_buffer_context { ~ggml_backend_hexagon_buffer_context() { - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); if (buffer) { if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { - GGMLHEXAGON_LOG_DEBUG("rpcmem %p, size %d", buffer, buffer_size); //do nonthing here because rpc mempool was used for HWACCEL_CDSP } else { ggml_aligned_free(buffer, 0); @@ -5397,7 +5394,6 @@ static const char * 
ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( ggml_backend_buffer_type_t buft, size_t size) { - GGMLHEXAGON_LOG_DEBUG("enter %s, size %d", __func__, size); struct ggml_backend_hexagon_context * ctx = static_cast(buft->context); GGML_ASSERT(nullptr != ctx); GGMLHEXAGON_LOG_DEBUG("device %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); @@ -5413,14 +5409,10 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( size_page = systeminfo.dwPageSize; #endif size_t size_aligned = size; - GGMLHEXAGON_LOG_DEBUG("size_aligned %d", size_aligned); if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); } - GGMLHEXAGON_LOG_DEBUG("size_aligned %d", size_aligned); if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { - GGMLHEXAGON_LOG_DEBUG("rpc mempool len %d", ctx->rpc_mempool_len); - GGMLHEXAGON_LOG_DEBUG("rpc mempool usage %d", ctx->rpc_mempool_usage); GGML_ASSERT(ctx->rpc_mempool_usage <= ctx->rpc_mempool_len); buffer_ctx->buffer = (static_cast(ctx->rpc_mempool)) + ctx->rpc_mempool_usage; GGMLHEXAGON_LOG_DEBUG("buffer_ctx->buffer %p", buffer_ctx->buffer); @@ -5434,7 +5426,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( GGMLHEXAGON_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20)); return nullptr; } else { - GGMLHEXAGON_LOG_DEBUG("%s: succeed to allocate %d MiB\n", __func__, size / (1 << 20)); + //GGMLHEXAGON_LOG_DEBUG("%s: succeed to allocate %d MiB\n", __func__, size / (1 << 20)); } return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, buffer_ctx, size); @@ -5577,14 +5569,14 @@ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_ *total = rpc_ion_memsize * SIZE_IN_MB; *free = (rpc_ion_memsize - rpc_ion_usage) * SIZE_IN_MB; GGMLHEXAGON_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize); - GGMLHEXAGON_LOG_DEBUG("rpc usage %d M", rpc_ion_usage); + GGMLHEXAGON_LOG_DEBUG("rpc usage %d M\n\n", rpc_ion_usage); } else { rpc_ion_memsize = ctx->rpc_mempool_capacity; rpc_ion_usage = ctx->rpc_mempool_usage; *total = rpc_ion_memsize; *free = (rpc_ion_memsize - rpc_ion_usage); GGMLHEXAGON_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize / SIZE_IN_MB); - GGMLHEXAGON_LOG_DEBUG("rpc usage %d M", rpc_ion_usage / SIZE_IN_MB); + GGMLHEXAGON_LOG_DEBUG("rpc usage %d M\n\n", rpc_ion_usage / SIZE_IN_MB); } } } @@ -5891,13 +5883,12 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { } const char * ggml_backend_hexagon_get_devname(size_t dev_num) { - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { if (HEXAGON_BACKEND_CDSP == dev_num) return "HEXAGON_BACKEND_CDSP"; } - //fallback + //fall through switch (dev_num) { case HEXAGON_BACKEND_QNNCPU: return "HEXAGON_BACKEND_QNN_CPU"; diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 0171957792010..7dcc9ff83aad7 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -17,9 +17,16 @@ print_tensors_info = 0 # enable/disable dump op info in handle_op dump_op_info = 0 -# 0: hwaccel approach through QNN -# 1: hwaccel approach through QNN-SINGLEGRAPH: mapping entire ggml cgraph to a single QNN graph -# 2: hwaccel approach through Hexagon cDSP +#enable/disable offload fp32 & quantized type mulmat +#quantized type mulmat works fine in HWACCEL_QNN at the moment +#quantized type 
mulmat doesn't work fine in HWACCEL_CDSP at the moment +#this item will make mulmat performance comparison easier +enable_q_mulmat = 0 + +# 0: hwaccel approach through HWACCEL_QNN: offload ggml op to QNN # 1: hwaccel approach through HWACCEL_QNN_SINGLEGRAPH: mapping entire ggml cgraph to a single QNN graph # 2: hwaccel approach through HWACCEL_CDSP: offload ggml op to cDSP directly # HWACCEL_QNN_SINGLEGRAPH not supported at the moment hwaccel_approach = 2 #hwaccel approach through QNN @@ -31,10 +38,6 @@ precision_mode = "fp16" #hwaccel approach through cDSP [cdsp] -#enable/disable offload mulmat to cDSP enable_mulmat_cdsp = 1 -#enable/disable offload fp32 & quantized type mulmat to cDSP -enable_q_mulmat = 0 #enable/disable rpc ion memory pool enable_rpc_ion_mempool = 1 #enable/disable rpc dma memory pool enable_rpc_dma_mempool = 0 From e2ae8048e28a1d4e7438d1758487c7aab2175e52 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 31 Mar 2025 21:55:11 +0800 Subject: [PATCH 151/200] ggml-hexagon: release ggml-hexagon v1.00 --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 45 ++++++++++++++++---------- scripts/ggml-hexagon.cfg | 8 ++--- 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index cc48137b2f646..0ae9d053b6607 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -128,7 +128,6 @@ // ================================================================================================= class qnn_instance; struct ggml_backend_hexagon_context; - static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx); #if 0//def NDEBUG @@ -320,6 +319,7 @@ struct hexagon_appcfg_t { int enable_rpc_dma_mempool; // enable/disable rpc dma memory pool const char * cfgfilename; const char * runtimelib_path; + char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN]; }; static struct hexagon_appcfg_t g_hexagon_appcfg = { @@ -345,6 +345,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { #elif defined(_WIN32) .qnn_runtimelib_path = "C:\\", #endif + .ggml_hexagon_version = {"1.00"}, }; //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices @@ -855,15 +856,16 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c char timestamp[GGMLHEXAGON_TMPBUF_LEN]; memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); - GGMLHEXAGON_LOG_INFO("hwaccel approach is %d(%s)", g_hexagon_appcfg.hwaccel_approach, + GGMLHEXAGON_LOG_INFO("ggml_hexagon_version: %s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_INFO("hwaccel approach: %d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); - GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, - ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); + GGMLHEXAGON_LOG_INFO("hexagon_backend: %d(%s)", g_hexagon_appcfg.hexagon_backend, + ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); ggmlhexagon_get_timestring(timestamp); if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); - GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); - GGMLHEXAGON_LOG_INFO("using rpc dma memory pool: %s", g_hexagon_appcfg.enable_rpc_dma_mempool ? 
"YES" : "NO"); + GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); + GGMLHEXAGON_LOG_INFO("using rpc dma memory pool: %s", g_hexagon_appcfg.enable_rpc_dma_mempool ? "YES" : "NO"); ggmlhexagon_probe_dspinfo(ctx); } else { GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); @@ -1440,6 +1442,8 @@ static void ggmlhexagon_load_cfg() { GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str()); }); std::string precision_mode; + std::string ggml_hexagon_version; + qnncfg_instance.get_stringvalue("general", "ggml_hexagon_version", ggml_hexagon_version, "1.00"); qnncfg_instance.get_intvalue("general", "print_qnn_internal_log", g_hexagon_appcfg.print_qnn_internal_log, 0); qnncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1); qnncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0); @@ -1453,6 +1457,8 @@ static void ggmlhexagon_load_cfg() { qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); qnncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 1); qnncfg_instance.get_intvalue("cdsp", "enable_rpc_dma_mempool", g_hexagon_appcfg.enable_rpc_dma_mempool, 0); + GGMLHEXAGON_LOG_INFO("ggml_hexagon_version=%s", ggml_hexagon_version.c_str()); + memcpy(g_hexagon_appcfg.ggml_hexagon_version, ggml_hexagon_version.c_str(), strlen(ggml_hexagon_version.c_str())); GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, @@ -1479,6 +1485,11 @@ static bool ggmlhexagon_check_valid_appcfg() { GGMLHEXAGON_LOG_INFO("hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP"); is_valid_appcfg = false; } + + if ((1 == g_hexagon_appcfg.enable_rpc_ion_mempool) && (1 == g_hexagon_appcfg.enable_rpc_dma_mempool)) { + GGMLHEXAGON_LOG_INFO("rpc ion mempool and rpc dma mempool cannot be enabled at the same time"); + is_valid_appcfg = false; + } } if (!is_valid_appcfg) { @@ -4719,6 +4730,10 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_handle); } + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_dma_mempool)) { + //TODO + } + return; } @@ -4790,7 +4805,7 @@ static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) { ggmlhexagon_deinit_rpcmempool(ctx); - ctx->domain_id = -1; + ctx->domain_id = -1; GGMLHEXAGON_LOG_INFO("leave %s", __func__); } @@ -5042,12 +5057,8 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const const struct ggml_tensor * src0 = op_tensor->src[0]; const struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = src0->ne[0]; - const uint32_t src0_rank = ggml_n_dims(src0); - const uint32_t src1_rank = ggml_n_dims(src1); switch (op_tensor->op) { case GGML_OP_ADD: - case GGML_OP_SUB: { if (!ggml_are_same_shape(src0, src1)) { return false; @@ -5409,7 +5420,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( size_page = systeminfo.dwPageSize; #endif size_t size_aligned = size; - if ((size_aligned % size_page) != 0) { + if (0 != (size_aligned % size_page)) { size_aligned += (size_page - (size_aligned % size_page)); } 
if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { @@ -5423,10 +5434,10 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( } buffer_ctx->buffer_size = size_aligned; if (nullptr == buffer_ctx->buffer) { - GGMLHEXAGON_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / (1 << 20)); + GGMLHEXAGON_LOG_WARN("%s: failed to allocate %d MiB\n", __func__, size / SIZE_IN_MB); return nullptr; } else { - //GGMLHEXAGON_LOG_DEBUG("%s: succeed to allocate %d MiB\n", __func__, size / (1 << 20)); + //GGMLHEXAGON_LOG_DEBUG("%s: succeed to allocate %d MiB\n", __func__, size / SIZE_IN_MB); } return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, buffer_ctx, size); @@ -5478,7 +5489,7 @@ static void ggml_backend_hexagon_free(ggml_backend_t backend) { g_hexagon_mgr[ctx->device].instance = nullptr; } - if (g_hexagon_mgr[ctx->device].backend != nullptr) { + if (nullptr != g_hexagon_mgr[ctx->device].backend) { //print timestamp and dsp information before deinit cdsp, useful for troubleshooting ggmlhexagon_print_running_timestamp(ctx); if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { @@ -5595,7 +5606,6 @@ static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_back static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); props->name = ggml_backend_hexagon_device_get_name(dev); props->description = ggml_backend_hexagon_device_get_description(dev); props->type = ggml_backend_hexagon_device_get_type(dev); @@ -5858,6 +5868,7 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { } ctx->devices.push_back(dev); + //here is the trick: make cDSP rpc memory pool happy because ggml's backend subsystem need this if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]); @@ -5888,7 +5899,7 @@ const char * ggml_backend_hexagon_get_devname(size_t dev_num) { return "HEXAGON_BACKEND_CDSP"; } - //fall through + //here is the trick: fall through for various scenarios switch (dev_num) { case HEXAGON_BACKEND_QNNCPU: return "HEXAGON_BACKEND_QNN_CPU"; diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 7dcc9ff83aad7..54c4eb6e1f851 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -1,4 +1,5 @@ [general] +version = "1.00" #0: HEXAGON_BACKEND_QNNCPU #1: HEXAGON_BACKEND_QNNGPU #2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP @@ -7,17 +8,14 @@ hexagon_backend = 2 # enable/disable QNN's internal log print_qnn_internal_log = 0 - # enable/disable perf of op function enable_perf = 1 - # enable/disable print tensors info in op function print_tensors_info = 0 - # enable/disable dump op info in handle_op dump_op_info = 0 -#enable/disable offload fp32 & quantized type mulmat +#enable/disable offload quantized type mulmat #quatized type mulmat works fine in HWACCEL_QNN at the moment #quatized type mulmat doesn't works fine in HWACCEL_CDSP at the moment #this item will make mulmat performance comprision easily @@ -39,6 +37,6 @@ precision_mode = "fp16" #hwaccel approach through cDSP [cdsp] #enable/disable rpc ion memory pool -enable_rpc_ion_mempool = 1 +enable_rpc_ion_mempool = 0 #enable/disable rpc dma memory pool enable_rpc_dma_mempool = 0 From be973b49074bb0cf4647dd578ea302bfd4e4bd59 Mon Sep 17 00:00:00 2001 From: zhouwg Date: 
Tue, 1 Apr 2025 08:04:38 +0800 Subject: [PATCH 152/200] ggml-hexagon: rebase to upstream --- scripts/build-run-android.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 3f1069be4978a..d970fcfb8a18f 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -26,7 +26,9 @@ HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 #v79 --- Snapdragon 8 Elite(aka Gen4) HTP_ARCH_VERSION=v75 -qnnparams=" -mg 2 -ngl 99 " +#running_params=" -mg 2 -ngl 99 " +#running_params=" -mg 2 -ngl 99 -t 8 -fa 1 " +running_params=" -mg 2 -ngl 99 -t 8 " function dump_vars() { @@ -209,7 +211,7 @@ function run_llamacli() adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-cli ${qnnparams} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" + && ${REMOTE_PATH}/llama-cli ${running_params} -no-cnv -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\"" } @@ -220,7 +222,7 @@ function run_llamabench() adb shell "cd ${REMOTE_PATH} \ && export LD_LIBRARY_PATH=${REMOTE_PATH} \ - && ${REMOTE_PATH}/llama-bench ${qnnparams} -m ${GGUF_MODEL_NAME}" + && ${REMOTE_PATH}/llama-bench ${running_params} -m ${GGUF_MODEL_NAME}" } From ab2712d93a962789458476272b81925918d145be Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 1 Apr 2025 10:00:35 +0800 Subject: [PATCH 153/200] ggml-hexagon: check configuration of enable_rpc_dma_mempool in function ggmlhexagon_check_valid_appcfg --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 0ae9d053b6607..a64b767b36b21 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1490,6 +1490,11 @@ static bool ggmlhexagon_check_valid_appcfg() { GGMLHEXAGON_LOG_INFO("rpc ion mempool and rpc dma mempool cannot be enabled at the same time"); is_valid_appcfg = false; } + + if (1 == g_hexagon_appcfg.enable_rpc_dma_mempool) { + GGMLHEXAGON_LOG_INFO("rpc dma mempool not supported"); + is_valid_appcfg = false; + } } if (!is_valid_appcfg) { From 06d25092cba402f8f75ea6e26c75eaf880ec584d Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 1 Apr 2025 11:46:52 +0800 Subject: [PATCH 154/200] ggml-hexagon: uniform rpc_ion_memsize and rpc_ion_usage between HWACCEL_CDSP and HWACCEL_QNN --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 35 ++++++++++---------------- 1 file changed, 13 insertions(+), 22 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index a64b767b36b21..40b72b400ba8e 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2250,8 +2250,8 @@ class qnn_instance { pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; std::unordered_map _rpcmem_store_map; std::unordered_map _rpcmem_usage_map; - size_t _rpcmem_usage = 0; // mempool usage in Mbytes - size_t _rpcmem_capacity = 512; // mempool size in Mbytes + size_t _rpcmem_usage = 0; // mempool usage in bytes + size_t _rpcmem_capacity = 0; // mempool size in bytes std::string _graph_name; HEXAGONBackend _device_id; @@ -2289,8 +2289,8 @@ void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { } void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { - if (_rpcmem_usage > (_rpcmem_capacity - 8)) { // reserve 8Mbytes in rpc mempool - 
GGMLHEXAGON_LOG_WARN("rpc mempool capcaity: %d MB, usage: %d MB", _rpcmem_capacity, _rpcmem_usage); + if (_rpcmem_usage > (_rpcmem_capacity - (8 * SIZE_IN_MB))) { // reserve 8Mbytes in rpc mempool + GGMLHEXAGON_LOG_WARN("rpc mempool capacity: %d MB, usage: %d MB", _rpcmem_capacity / SIZE_IN_MB, _rpcmem_usage / SIZE_IN_MB); return nullptr; } @@ -2299,9 +2299,7 @@ void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { return nullptr; _rpcmem_usage_map.insert(std::pair(aligned_buf, bytes)); - size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); - rpcmem_usage_in_bytes += bytes; - _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); + _rpcmem_usage += bytes; return aligned_buf; } @@ -2319,9 +2317,7 @@ void qnn_instance::free_rpcmem(void * buf) { void * rpcbuffer = it->first; if (buf == rpcbuffer) { rpcbuffer_size = it->second; - size_t rpcmem_usage_in_bytes = _rpcmem_usage * (1 << 20); - rpcmem_usage_in_bytes -= rpcbuffer_size; - _rpcmem_usage = rpcmem_usage_in_bytes / ( 1 << 20); + _rpcmem_usage -= rpcbuffer_size; } } if (rpcbuffer_size != 0) { @@ -3191,11 +3187,11 @@ void qnn_instance::htp_probe_rpc_meminfo() { } } if (candidate_size > _rpcmem_capacity) - _rpcmem_capacity = candidate_size; + _rpcmem_capacity = candidate_size * SIZE_IN_MB; free_rpcmem(); _rpcmem_usage = 0; - GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity); + GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity / SIZE_IN_MB); } void qnn_instance::htp_print_info() { @@ -5579,21 +5575,16 @@ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_ size_t rpc_ion_memsize = 0; size_t rpc_ion_usage = 0; if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) { - //TODO: uniform rpc_ion_memsize and rpc_ion_usage between HWACCEL_CDSP and HWACCEL_QNN rpc_ion_memsize = ctx->instance->get_rpcmem_capacity(); rpc_ion_usage = ctx->instance->get_rpcmem_usage(); - *total = rpc_ion_memsize * SIZE_IN_MB; - *free = (rpc_ion_memsize - rpc_ion_usage) * SIZE_IN_MB; - GGMLHEXAGON_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize); - GGMLHEXAGON_LOG_DEBUG("rpc usage %d M\n\n", rpc_ion_usage); } else { rpc_ion_memsize = ctx->rpc_mempool_capacity; - rpc_ion_usage = ctx->rpc_mempool_usage; - *total = rpc_ion_memsize; - *free = (rpc_ion_memsize - rpc_ion_usage); - GGMLHEXAGON_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize / SIZE_IN_MB); - GGMLHEXAGON_LOG_DEBUG("rpc usage %d M\n\n", rpc_ion_usage / SIZE_IN_MB); + rpc_ion_usage = ctx->rpc_mempool_usage; } + *total = rpc_ion_memsize; + *free = (rpc_ion_memsize - rpc_ion_usage); + GGMLHEXAGON_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize / SIZE_IN_MB); + GGMLHEXAGON_LOG_DEBUG("rpc usage %d M\n\n", rpc_ion_usage / SIZE_IN_MB); } } From 8a5c5bda9761c7c9f90efc7ce366d10d9ee961ab Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 1 Apr 2025 22:20:08 +0800 Subject: [PATCH 155/200] ggml-hexagon: make buffer mechanism more clear in HWACCEL_CDSP approach --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 113 ++++++++++++++++++++++--- 1 file changed, 99 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 40b72b400ba8e..d7b441b4dc270 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -5401,7 +5401,13 @@ static ggml_backend_buffer_i ggml_backend_hexagon_buffer_interface = { static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return "hexagon-buffer"; + if 
((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + return "hexagon-ion-buffer"; + } + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_dma_mempool)) { + return "hexagon-dma-buffer"; + } + return "hexagon-normal-buffer"; } static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( @@ -5424,10 +5430,10 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( if (0 != (size_aligned % size_page)) { size_aligned += (size_page - (size_aligned % size_page)); } - if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { - GGML_ASSERT(ctx->rpc_mempool_usage <= ctx->rpc_mempool_len); + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + GGML_ASSERT(size + ctx->rpc_mempool_usage <= ctx->rpc_mempool_len); buffer_ctx->buffer = (static_cast(ctx->rpc_mempool)) + ctx->rpc_mempool_usage; - GGMLHEXAGON_LOG_DEBUG("buffer_ctx->buffer %p", buffer_ctx->buffer); + GGMLHEXAGON_LOG_DEBUG("size %d(%d M), buffer_ctx->buffer %p", size, size / SIZE_IN_MB, buffer_ctx->buffer); GGML_ASSERT(nullptr != buffer_ctx->buffer); ctx->rpc_mempool_usage += size_aligned; } else { @@ -5455,6 +5461,10 @@ static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_ return (2 * (1 << 29)); } +static bool ggml_backend_buft_is_hexagon(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name == ggml_backend_hexagon_buffer_type_name; +} + static bool ggml_backend_hexagon_buffer_is_host(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); return true; @@ -5579,7 +5589,7 @@ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_ rpc_ion_usage = ctx->instance->get_rpcmem_usage(); } else { rpc_ion_memsize = ctx->rpc_mempool_capacity; - rpc_ion_usage = ctx->rpc_mempool_usage; + rpc_ion_usage = ctx->rpc_mempool_usage; } *total = rpc_ion_memsize; *free = (rpc_ion_memsize - rpc_ion_usage); @@ -5590,6 +5600,10 @@ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_ static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) { struct ggml_backend_hexagon_context * ctx = static_cast(dev->context); + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + return GGML_BACKEND_DEVICE_TYPE_GPU; + } + if (HEXAGON_BACKEND_QNNCPU == ctx->device) return GGML_BACKEND_DEVICE_TYPE_ACCEL; else if (HEXAGON_BACKEND_QNNGPU == ctx->device) @@ -5608,10 +5622,15 @@ static void ggml_backend_hexagon_device_get_props(ggml_backend_dev_t dev, ggml_backend_hexagon_device_get_memory(dev, &props->memory_free, &props->memory_total); props->caps = { /* .async = */ false, - /* .host_buffer = */ false, - /* .buffer_from_host_ptr = */ true, + /* .host_buffer = */ true, + /* .buffer_from_host_ptr = */ false, /* .events = */ false, }; + + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + //don't use system memory in this scenario + props->caps.host_buffer = false; + } } static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_t dev, const char * params) { @@ -5670,7 +5689,7 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device /* .context = */ &g_hexagon_mgr[device_index], }; - if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { //here is the 
trick: //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP //and we need to re-use the g_hexagon_mgr @@ -5681,6 +5700,60 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device return &ggml_backend_buffer_type_hexagon; } +static const char * ggml_backend_hexagon_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + GGML_UNUSED(buft); + return "Hexagon_Host"; +} + +static const char * ggml_backend_hexagon_host_buffer_name(ggml_backend_buffer_t buffer) { + GGML_UNUSED(buffer); + return "Hexagon_Host"; +} + +static void ggml_backend_hexagon_host_buffer_free(ggml_backend_buffer_t buffer) { + ggml_aligned_free(buffer->context, 0); +} + +static void * ggml_hexagon_host_malloc(ggml_backend_buffer_type_t buft, size_t size) { + return ggml_aligned_malloc(size); +} + +static ggml_backend_buffer_t ggml_backend_hexagon_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * host_ptr = ggml_hexagon_host_malloc(buft, size); + + if (nullptr == host_ptr) { + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(host_ptr, size); + buffer->buft = buft; + buffer->iface.free_buffer = ggml_backend_hexagon_host_buffer_free; + + return buffer; +} + +static ggml_backend_buffer_type_t ggml_backend_hexagon_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_hexagon_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_hexagon_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ nullptr, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), 0), + /* .context = */ nullptr, + }; + + return &ggml_backend_hexagon_buffer_type_host; +} + +static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_host_buffer_type(ggml_backend_dev_t dev) { + GGML_UNUSED(dev); + return ggml_backend_hexagon_host_buffer_type(); +} + static ggml_backend_buffer_type_t ggml_backend_hexagon_device_get_buffer_type(ggml_backend_dev_t dev) { ggml_backend_hexagon_context * ctx = (ggml_backend_hexagon_context *)dev->context; return ggml_backend_hexagon_buffer_type(ctx->device); @@ -5695,7 +5768,14 @@ static ggml_backend_buffer_t ggml_backend_hexagon_device_buffer_from_host_ptr(gg } static bool ggml_backend_hexagon_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - GGML_UNUSED(dev); + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + if (ggml_backend_buft_is_hexagon(buft)) { + ggml_backend_hexagon_context * dev_ctx = (ggml_backend_hexagon_context *)dev->context; + ggml_backend_hexagon_context * buft_ctx = (ggml_backend_hexagon_context *)buft->context; + return buft_ctx->device == dev_ctx->device; + } + } + return ggml_backend_buft_is_host(buft); } @@ -5707,7 +5787,7 @@ static struct ggml_backend_device_i ggml_backend_hexagon_device_interface = { /* .get_props = */ ggml_backend_hexagon_device_get_props, /* .init_backend = */ ggml_backend_hexagon_device_init_backend, /* .get_buffer_type = */ ggml_backend_hexagon_device_get_buffer_type, - /* .get_host_buffer_type = */ nullptr, + /* .get_host_buffer_type = */ 
ggml_backend_hexagon_device_get_host_buffer_type, /* .buffer_from_host_ptr = */ ggml_backend_hexagon_device_buffer_from_host_ptr, /* .supports_op = */ nullptr, /* .supports_buft = */ ggml_backend_hexagon_device_supports_buft, @@ -5844,18 +5924,23 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { ggml_backend_hexagon_reg_context * ctx = new ggml_backend_hexagon_reg_context; for (int i = 0; i < ggml_backend_hexagon_get_device_count(); i++) { - if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_cdsp; } else { ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_qnn; } + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + //don't use system memory in this scenario + ggml_backend_hexagon_device_interface.get_host_buffer_type = nullptr; + } + GGMLHEXAGON_LOG_DEBUG("create backend device for device %d", i); ggml_backend_dev_t dev = new ggml_backend_device{ /* .iface = */ ggml_backend_hexagon_device_interface, /* .reg = */ &reg, /* .context = */ &g_hexagon_mgr[i] }; - if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { //here is the trick: //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP //and we need to re-use the g_hexagon_mgr @@ -5865,8 +5950,8 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { ctx->devices.push_back(dev); //here is the trick: make cDSP rpc memory pool happy because ggml's backend subsystem need this - if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { - GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend); int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]); if (0 != result) { GGMLHEXAGON_LOG_INFO("init hexagon dsp failure"); From 119be625f61b716ade397fe55b177533b70ce571 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 2 Apr 2025 12:04:48 +0800 Subject: [PATCH 156/200] ggml-hexagon: add perf function in hexagon kernels on cDSP side --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 5 +++- ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c | 26 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index d7b441b4dc270..bdc9decb86050 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -5867,8 +5867,11 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t GGMLHEXAGON_LOG_DEBUG("index %d", index); ggml_backend_hexagon_reg_context * ctx = (ggml_backend_hexagon_reg_context *)reg->context; - if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); + //here is the trick: + //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP + //so return ctx->devices[0] return ctx->devices[0]; } else { GGML_ASSERT(index < ctx->devices.size()); diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c index 355852b394dfc..06a56475f3db5 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c +++ 
b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c @@ -1149,6 +1149,25 @@ static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, co } +static inline uint64 hexagon_perf_get_time_us(void) +{ + unsigned long long count; + asm volatile (" %0 = c31:30 " : "=r"(count)); + return (uint64)(count) * 10ull / 192ull; +} + +static void ggml_time_init(void) { + +} + +static int64_t ggml_time_ms(void) { + return hexagon_perf_get_time_us() * 1000; +} + +int64_t ggml_time_us(void) { + return hexagon_perf_get_time_us(); +} + // ================================================================================================= // section-4: ggml-hexagon kernel helper function // ================================================================================================= @@ -1266,6 +1285,8 @@ static void ggml_compute_forward_add_f32( const struct ggml_tensor * src1, struct ggml_tensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + uint64_t start_time = ggml_time_us(); + memcpy(dst->ne, src1->ne, 16); memcpy(dst->nb, src1->nb, 16); ggmlhexagon_dump_tensor(src0, 1); @@ -1328,6 +1349,11 @@ static void ggml_compute_forward_add_f32( } } } + + uint64_t end_time = ggml_time_us(); + uint64_t duration = (end_time - start_time); + GGMLHEXAGON_LOG_DEBUG("duration %llu us", duration); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); } From 895d4033d0cd3f39857ad6cef5d20192f33ece32 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 2 Apr 2025 15:05:45 +0800 Subject: [PATCH 157/200] ggml-hexagon: fix a stupid issue of why set rpc latency failure and improve NPU performance significantly --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index bdc9decb86050..bd5fb6909ce08 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -4473,7 +4473,7 @@ static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { return false; } -static void ggmlhexagon_set_rpc_latency(int domain, int qos, int latency) { +static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int latency) { int hexagon_error = AEE_SUCCESS; if (remote_handle_control) { @@ -4486,9 +4486,8 @@ static void ggmlhexagon_set_rpc_latency(int domain, int qos, int latency) { */ data.enable = qos; data.latency = latency; - hexagon_error = remote_handle64_control(DSPRPC_GET_DSP_INFO, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); + hexagon_error = remote_handle64_control(handle, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); if (hexagon_error != AEE_SUCCESS) { - //FIXME: why set rpc latency failure GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); goto bail; } else { @@ -4940,7 +4939,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_INFO("only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP currently"); ggmlhexagon_probe_dspinfo(ctx); ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1); - ggmlhexagon_set_rpc_latency(domain_id, RPC_PM_QOS, 100); + ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 1000); ggmlhexagon_init_rpcmempool(ctx); } else { GGMLHEXAGON_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, From ce4abac4938be819a87efa4a38f0cae66156aa7a Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 2 Apr 2025 15:39:34 +0800 Subject: [PATCH 158/200] ggml-hexagon: make helper function 
ggmlhexagon_get_timestring() thread-safe --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index bd5fb6909ce08..6bccdf3da3fd1 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -837,16 +837,16 @@ static const char * ggmlhexagon_get_hwaccel_approach_name(int hwaccle_approach) static void ggmlhexagon_get_timestring(char * p_currenttime) { #if defined(__ANDROID__) || defined(__linux__) time_t n_seconds = 0; - struct tm * p_tm = nullptr; + struct tm now_time; if (nullptr == p_currenttime) return; time(&n_seconds); - p_tm = localtime(&n_seconds); + localtime_r(&n_seconds, &now_time); snprintf(p_currenttime, GGMLHEXAGON_TMPBUF_LEN, "%04d-%02d-%02d,%02d:%02d:%02d", - p_tm->tm_year + 1900, p_tm->tm_mon + 1, p_tm->tm_mday, - p_tm->tm_hour, p_tm->tm_min, p_tm->tm_sec); + now_time.tm_year + 1900, now_time.tm_mon + 1, now_time.tm_mday, + now_time.tm_hour, now_time.tm_min, now_time.tm_sec); #else //TODO: WoA #endif From f0244a64d0f5f4f670f21d948fb44be1f9931f84 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 2 Apr 2025 16:48:16 +0800 Subject: [PATCH 159/200] ggml-hexagon: fix a typo in ggml-hexagon.cpp --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 6bccdf3da3fd1..d74b9abaf0755 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -11,7 +11,7 @@ * section-2 internal troubleshooting function/class * section-3 helper function for WoA(Windows on ARM) * section-4 general helper function - * section-5 QNN helper function + * section-5 QNN helper function/class * section-6 implementation of hwaccel approach through QNN: offload ggmlop to QNN * section-7 cDSP helper function * section-8 implementation of ggml-hexagon backend according to specification in ggml backend subsystem @@ -1504,7 +1504,7 @@ static bool ggmlhexagon_check_valid_appcfg() { } // ================================================================================================= -// section-5: QNN helper function +// section-5: QNN helper function/class // ================================================================================================= //ensure every QNN tensor/opcfg name is unique, threadsafe is not required at the moment static void ggmlqnn_reset_idx() { @@ -5982,7 +5982,7 @@ const char * ggml_backend_hexagon_get_devname(size_t dev_num) { return "HEXAGON_BACKEND_CDSP"; } - //here is the trick: fall through for various scenarios + //here is the trick: fall back for various scenarios switch (dev_num) { case HEXAGON_BACKEND_QNNCPU: return "HEXAGON_BACKEND_QNN_CPU"; From 478bb298474751d99fff0ba53d2ee33b7b4ecc77 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 2 Apr 2025 18:45:22 +0800 Subject: [PATCH 160/200] ggml-hexagon: list all known todo and fixme tasks in ggml-hexagon.cpp --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 52 +++++++++++++++++--------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index d74b9abaf0755..e99f392a25228 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -128,7 +128,6 @@ // ================================================================================================= 
class qnn_instance; struct ggml_backend_hexagon_context; -static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx); #if 0//def NDEBUG #define GGMLHEXAGON_DEBUG 0 @@ -852,6 +851,7 @@ static void ggmlhexagon_get_timestring(char * p_currenttime) { #endif } +static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx); static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * ctx) { char timestamp[GGMLHEXAGON_TMPBUF_LEN]; memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); @@ -1205,7 +1205,7 @@ static size_t ggmlhexagon_get_system_total_memory_in_bytes() { return pages * page_size; #else - //FIXME: Snapdragon based WoA(Windows on ARM) + //TODO: Snapdragon based WoA(Windows on ARM) MEMORYSTATUSEX statex; statex.dwLength = sizeof(statex); if (GlobalMemoryStatusEx(&statex)) { @@ -1228,7 +1228,7 @@ static size_t ggmlhexagon_get_system_free_memory_in_bytes() { return avail_pages * page_size; #else - //FIXME: Snapdragon based WoA(Windows on ARM) + //TODO: Snapdragon based WoA(Windows on ARM) MEMORYSTATUSEX statex; statex.dwLength = sizeof(statex); if (GlobalMemoryStatusEx(&statex)) { @@ -1561,7 +1561,7 @@ static char * ggmlqnn_strndup(const char * source, size_t maxlen) { #if defined(__ANDROID__) || defined(__linux__) return strndup(source, maxlen); #else - //FIXME:behaviour is not exactly same to Android&Linux + //TODO:behaviour is not exactly same to Android&Linux GGML_UNUSED(maxlen); return strdup(source); #endif @@ -3163,7 +3163,7 @@ int qnn_instance::htp_init_perfinfra() { htp_perfinfra->createPowerConfigId(device_id, core_id, &power_configid); _qnn_htp_perfinfra = htp_perfinfra; _qnn_htp_powerconfig_id = power_configid; - //FIXME:hardcode to 0 and 0 although it's correct + //TODO:hardcode to 0 and 0 although it's correct _qnn_htp_device_id = device_id; _qnn_htp_core_id = core_id; @@ -3178,7 +3178,7 @@ void qnn_instance::htp_probe_rpc_meminfo() { for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); if (nullptr == rpc_buffer) { - GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -4694,7 +4694,7 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); if (nullptr == rpc_buffer) { - GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MB) failure, %s\n", probe_slots[idx], strerror(errno)); + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -4708,13 +4708,13 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MB", ctx->rpc_mempool_capacity / SIZE_IN_MB); if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { - //FIXME: reasonable rpc memory pool size + //FIXME: reasonable rpc memory pool size through a better approach rather than hardcoded size ctx->rpc_mempool_len = 1024 * SIZE_IN_MB; if (ctx->rpc_mempool_len > ctx->rpc_mempool_capacity) { GGMLHEXAGON_LOG_WARN("rpc mempool is too big"); return; } - 
//FIXME: use ion memory pool currently, it seems there is unknown bug with DMA memory pool + //FIXME: it seems there is unknown issue with DMA memory pool ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, ctx->rpc_mempool_len); if (nullptr == ctx->rpc_mempool) { @@ -4831,11 +4831,11 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { if (nullptr == ctx) return 1; GGMLHEXAGON_LOG_INFO("init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); - if (nullptr != ctx->rpc_mempool) { + if (0 != ctx->ggmlop_handle) { GGMLHEXAGON_LOG_DEBUG("already init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); return 0; } - ctx->ggmlop_handle = -1; + ctx->ggmlop_handle = 0; if (-1 == domain_id) { if (nullptr != domain_type) { @@ -4936,10 +4936,11 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { hexagon_error = ggmlop_dsp_open(ggmlop_domain_uri, &ctx->ggmlop_handle); if (AEE_SUCCESS == hexagon_error) { GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); - GGMLHEXAGON_LOG_INFO("only support offload GGML_OP_ADD and GGML_OP_MUL_MAT to cDSP currently"); + //FIXME: only support offload fp32 GGML_OP_MUL_MAT to cDSP + GGMLHEXAGON_LOG_INFO("only support offload fp32 GGML_OP_ADD and fp32 GGML_OP_MUL_MAT to cDSP currently"); ggmlhexagon_probe_dspinfo(ctx); ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1); - ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 1000); + ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100); ggmlhexagon_init_rpcmempool(ctx); } else { GGMLHEXAGON_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, @@ -4988,6 +4989,11 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ return; } + //FIXME:try to fully understand the tech detail in qidl: + // qidl is a binary tool to generate some very complicated and hard-to customized bridge-layer codes + // between ARM-AP and cDSP. the mechanism in qidl/FastRPC is exactly similar to mechanism in TEE. + // try to find a better/efficient approach to exchange necessary data between ARM-AP side and cDSP side. + // manually modifying the important data structure ggml_tensor in ggml.h is not make-sense and not acceptable. 
dsptensor_0.data = src0->data; dsptensor_0.data_len = ggml_nbytes(src0); dsptensor_0.type = src0->type; @@ -5455,9 +5461,15 @@ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer } static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); - - return (2 * (1 << 29)); + struct ggml_backend_hexagon_context * ctx = static_cast(buft->context); + GGML_ASSERT(nullptr != ctx); + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + GGML_ASSERT(ctx->rpc_mempool_len > (8 * SIZE_IN_MB)); + return ctx->rpc_mempool_len - (8 * SIZE_IN_MB); + } else { + //TODO:this is an experimental value for LLM models + return (1024 * SIZE_IN_MB); + } } static bool ggml_backend_buft_is_hexagon(ggml_backend_buffer_type_t buft) { @@ -5465,7 +5477,13 @@ static bool ggml_backend_buft_is_hexagon(ggml_backend_buffer_type_t buft) { } static bool ggml_backend_hexagon_buffer_is_host(ggml_backend_buffer_type_t buft) { - GGML_UNUSED(buft); + struct ggml_backend_hexagon_context * ctx = static_cast(buft->context); + GGML_ASSERT(nullptr != ctx); + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + //FIXME: return false here is make sense in this scenario although this is not key-point at the moment + // fix it after solving other urgent tasks + //return false; + } return true; } From d914424bec3eb56ca25f080f8c5bf1891fc420c8 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 2 Apr 2025 22:00:47 +0800 Subject: [PATCH 161/200] ggml-hexagon: fix units MB -> MiB --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 30 +++++++++++++------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index e99f392a25228..661a17aedd2ca 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -2290,7 +2290,7 @@ void * qnn_instance::alloc_rpcmem_internal(size_t bytes, size_t alignment) { void * qnn_instance::alloc_rpcmem(size_t bytes, size_t alignment) { if (_rpcmem_usage > (_rpcmem_capacity - (8 * SIZE_IN_MB))) { // reserve 8Mbytes in rpc mempool - GGMLHEXAGON_LOG_WARN("rpc mempool capacity: %d MB, usage: %d MB", _rpcmem_capacity / SIZE_IN_MB, _rpcmem_usage / SIZE_IN_MB); + GGMLHEXAGON_LOG_WARN("rpc mempool capacity: %d MiB, usage: %d MiB", _rpcmem_capacity / SIZE_IN_MB, _rpcmem_usage / SIZE_IN_MB); return nullptr; } @@ -3178,7 +3178,7 @@ void qnn_instance::htp_probe_rpc_meminfo() { for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(alloc_rpcmem_internal(probe_slots[idx] * SIZE_IN_MB, 4)); if (nullptr == rpc_buffer) { - GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MiB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -3191,7 +3191,7 @@ void qnn_instance::htp_probe_rpc_meminfo() { free_rpcmem(); _rpcmem_usage = 0; - GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MB\n", _rpcmem_capacity / SIZE_IN_MB); + GGMLHEXAGON_LOG_INFO("capacity of rpc ion memory %d MiB\n", _rpcmem_capacity / SIZE_IN_MB); } void qnn_instance::htp_print_info() { @@ -3207,7 +3207,7 @@ void qnn_instance::htp_print_info() { QnnHtpDevice_Arch_t htp_arch = chipinfo.arch; GGMLHEXAGON_LOG_DEBUG("HTP_TYPE:%d(%s)", 
devinfo->devType, (devinfo->devType == QNN_HTP_DEVICE_TYPE_ON_CHIP) ? "QNN_HTP_DEVICE_TYPE_ON_CHIP" : "QNN_HTP_DEVICE_TYPE_UNKNOWN"); - GGMLHEXAGON_LOG_DEBUG("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MB," \ + GGMLHEXAGON_LOG_DEBUG("qualcomm soc_model:%d(%s), htp_arch:%d(%s), vtcm_size:%d MiB," \ "dlbc_support:%d, signedpd_support:%d", \ chipinfo.socModel, ggmlhexagon_get_socmodel_desc(chipinfo.socModel), \ htp_arch, ggmlhexagon_get_htparch_desc(htp_arch), chipinfo.vtcmSize, \ @@ -4694,7 +4694,7 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); if (nullptr == rpc_buffer) { - GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); + GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MiB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; @@ -4703,9 +4703,9 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { } } ctx->rpc_mempool_capacity = candidate_size * SIZE_IN_MB; - GGMLHEXAGON_LOG_DEBUG("rpc memory capacity %ld(%d M) for device %d", + GGMLHEXAGON_LOG_DEBUG("rpc memory capacity %ld(%d MiB) for device %d", ctx->rpc_mempool_capacity, ctx->rpc_mempool_capacity / SIZE_IN_MB, ctx->device); - GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MB", ctx->rpc_mempool_capacity / SIZE_IN_MB); + GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB); if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { //FIXME: reasonable rpc memory pool size through a better approach rather than hardcoded size @@ -4721,7 +4721,7 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_WARN("alloc rpc memorypool %d failed", ctx->rpc_mempool_len); return; } else { - GGMLHEXAGON_LOG_DEBUG("alloc rpc memorypool %p successfully %ld(%d M)", + GGMLHEXAGON_LOG_DEBUG("alloc rpc memorypool %p successfully %ld(%d MiB)", ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_len / SIZE_IN_MB); } @@ -4830,9 +4830,9 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { if (nullptr == ctx) return 1; - GGMLHEXAGON_LOG_INFO("init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + GGMLHEXAGON_LOG_INFO("init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); if (0 != ctx->ggmlop_handle) { - GGMLHEXAGON_LOG_DEBUG("already init Hexagon DSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + GGMLHEXAGON_LOG_DEBUG("already init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); return 0; } ctx->ggmlop_handle = 0; @@ -4848,7 +4848,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_DEBUG("API is not supported on this target so cannot get domains info from the device. 
falling back to legacy approach of using default domain id"); hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); if (hexagon_error != AEE_SUCCESS) { - GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to CDSP domain", hexagon_error); + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to cDSP domain", hexagon_error); } } else if (hexagon_error != AEE_SUCCESS) { GGMLHEXAGON_LOG_DEBUG("error in getting domains information"); @@ -4871,7 +4871,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_DEBUG("DSP domain is not provided, retrieving DSP information using Remote APIs"); hexagon_error = ggmlhexagon_get_dsp_support(&domain_id); if (hexagon_error != AEE_SUCCESS) { - GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to CDSP domain", hexagon_error); + GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to cDSP domain", hexagon_error); } } } @@ -5438,7 +5438,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { GGML_ASSERT(size + ctx->rpc_mempool_usage <= ctx->rpc_mempool_len); buffer_ctx->buffer = (static_cast(ctx->rpc_mempool)) + ctx->rpc_mempool_usage; - GGMLHEXAGON_LOG_DEBUG("size %d(%d M), buffer_ctx->buffer %p", size, size / SIZE_IN_MB, buffer_ctx->buffer); + GGMLHEXAGON_LOG_DEBUG("size %d(%d MiB), buffer_ctx->buffer %p", size, size / SIZE_IN_MB, buffer_ctx->buffer); GGML_ASSERT(nullptr != buffer_ctx->buffer); ctx->rpc_mempool_usage += size_aligned; } else { @@ -5610,8 +5610,8 @@ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_ } *total = rpc_ion_memsize; *free = (rpc_ion_memsize - rpc_ion_usage); - GGMLHEXAGON_LOG_DEBUG("rpc memsize %d M", rpc_ion_memsize / SIZE_IN_MB); - GGMLHEXAGON_LOG_DEBUG("rpc usage %d M\n\n", rpc_ion_usage / SIZE_IN_MB); + GGMLHEXAGON_LOG_DEBUG("rpc memsize %d MiB", rpc_ion_memsize / SIZE_IN_MB); + GGMLHEXAGON_LOG_DEBUG("rpc usage %d MiB\n\n", rpc_ion_usage / SIZE_IN_MB); } } From 3033280c23a8dc63a60f8379fcbf5eb8746b89a4 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 3 Apr 2025 14:25:27 +0800 Subject: [PATCH 162/200] ggml-hexagon: try to make ggml-hexagon backend works fine in a standard Android APP --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 28 +++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 661a17aedd2ca..abe6bc0e1c5c2 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1390,21 +1390,18 @@ static void * ggmlhexagon_type_trait(ggml_backend_hexagon_context * ctx, ggml_te static void ggmlhexagon_set_runtime_path(size_t device, const std::string & path) { #if defined(__ANDROID__) if ((HEXAGON_BACKEND_QNNNPU == device) || (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach)) { - if (0 == setenv("LD_LIBRARY_PATH", - (path + - ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), - 1)) { - GGMLHEXAGON_LOG_DEBUG("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv successfully"); + std::string lib_runtime_path = path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images"; + if (0 == setenv("LD_LIBRARY_PATH", lib_runtime_path.c_str(), 1)) { + GGMLHEXAGON_LOG_DEBUG("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv %s successfully", lib_runtime_path.c_str()); } else { - GGMLHEXAGON_LOG_ERROR("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv failure"); + 
GGMLHEXAGON_LOG_ERROR("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv %s failure", lib_runtime_path.c_str()); } - if (0 == setenv("ADSP_LIBRARY_PATH", - (path + - ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp").c_str(), - 1)) { - GGMLHEXAGON_LOG_DEBUG("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv successfully"); + + std::string adsp_runtime_path = path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp"; + if (0 == setenv("ADSP_LIBRARY_PATH", adsp_runtime_path.c_str(), 1)) { + GGMLHEXAGON_LOG_DEBUG("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv %s successfully", adsp_runtime_path.c_str()); } else { - GGMLHEXAGON_LOG_ERROR("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv failure"); + GGMLHEXAGON_LOG_ERROR("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv %s failure", adsp_runtime_path.c_str()); } } else { if (0 == setenv("LD_LIBRARY_PATH", @@ -1469,6 +1466,11 @@ static void ggmlhexagon_load_cfg() { } else { g_hexagon_appcfg.precision_mode = 0; } + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + ggmlhexagon_set_runtime_path(HEXAGON_BACKEND_CDSP, g_hexagon_appcfg.runtimelib_path); + } + initialized = true; } @@ -6070,8 +6072,6 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_pat return nullptr; } - std::string path = qnn_lib_path; - ggmlhexagon_set_runtime_path(device, path); if (nullptr != g_hexagon_mgr[device].backend) { GGMLHEXAGON_LOG_DEBUG("backend %d(%s) already loaded", device, ggml_backend_hexagon_get_devname(device)); From 374f27e14b47226affeb8dd55f056fa7f6c1fe07 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 3 Apr 2025 18:55:51 +0800 Subject: [PATCH 163/200] ggml-hexagon: remove reduament code and make debug log more clear --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 81 +++++++++++++------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index abe6bc0e1c5c2..2c99cc663dafb 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -317,7 +317,7 @@ struct hexagon_appcfg_t { int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool int enable_rpc_dma_mempool; // enable/disable rpc dma memory pool const char * cfgfilename; - const char * runtimelib_path; + const char * runtime_libpath; char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN]; }; @@ -338,7 +338,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .cfgfilename = "ggml-hexagon.cfg", #if defined(__ANDROID__) //Android command line program - .runtimelib_path = "/data/local/tmp/", + .runtime_libpath = "/data/local/tmp/", #elif defined(__linux__) .qnn_runtimelib_path = "/tmp/", #elif defined(_WIN32) @@ -989,7 +989,7 @@ class hexagon_appcfg { if (pos != len) str.erase(pos); } - void trim(std::string& str) { + void trim(std::string & str) { ltrim(str); rtrim(str); } @@ -1392,16 +1392,16 @@ static void ggmlhexagon_set_runtime_path(size_t device, const std::string & path if ((HEXAGON_BACKEND_QNNNPU == device) || (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach)) { std::string lib_runtime_path = path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images"; if (0 == setenv("LD_LIBRARY_PATH", lib_runtime_path.c_str(), 1)) { - GGMLHEXAGON_LOG_DEBUG("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv %s successfully", lib_runtime_path.c_str()); + GGMLHEXAGON_LOG_DEBUG("setenv 
LD_LIBRARY_PATH %s successfully", lib_runtime_path.c_str()); } else { - GGMLHEXAGON_LOG_ERROR("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv %s failure", lib_runtime_path.c_str()); + GGMLHEXAGON_LOG_ERROR("setenv LD_LIBRARY_PATH %s failure", lib_runtime_path.c_str()); } std::string adsp_runtime_path = path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp"; if (0 == setenv("ADSP_LIBRARY_PATH", adsp_runtime_path.c_str(), 1)) { - GGMLHEXAGON_LOG_DEBUG("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv %s successfully", adsp_runtime_path.c_str()); + GGMLHEXAGON_LOG_DEBUG("setenv ADSP_LIBRARY_PATH %s successfully", adsp_runtime_path.c_str()); } else { - GGMLHEXAGON_LOG_ERROR("HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP setenv %s failure", adsp_runtime_path.c_str()); + GGMLHEXAGON_LOG_ERROR("setenv ADSP_LIBRARY_PATH %s failure", adsp_runtime_path.c_str()); } } else { if (0 == setenv("LD_LIBRARY_PATH", @@ -1429,7 +1429,7 @@ static void ggmlhexagon_load_cfg() { memset(time_string, 0, GGMLHEXAGON_TMPBUF_LEN); ggmlhexagon_get_timestring(time_string); GGMLHEXAGON_LOG_DEBUG("program running start time:%s", time_string); - std::string cfg_filename = std::string(g_hexagon_appcfg.runtimelib_path) + std::string(g_hexagon_appcfg.cfgfilename); + std::string cfg_filename = std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); GGMLHEXAGON_LOG_INFO("load hexagon appcfg from %s", cfg_filename.c_str()); hexagon_appcfg qnncfg_instance; qnncfg_instance.load(cfg_filename); @@ -1454,22 +1454,21 @@ static void ggmlhexagon_load_cfg() { qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); qnncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 1); qnncfg_instance.get_intvalue("cdsp", "enable_rpc_dma_mempool", g_hexagon_appcfg.enable_rpc_dma_mempool, 0); - GGMLHEXAGON_LOG_INFO("ggml_hexagon_version=%s", ggml_hexagon_version.c_str()); - memcpy(g_hexagon_appcfg.ggml_hexagon_version, ggml_hexagon_version.c_str(), strlen(ggml_hexagon_version.c_str())); + GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", ggml_hexagon_version.c_str()); GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); - GGMLHEXAGON_LOG_INFO("qnn runtime lib path=%s", g_hexagon_appcfg.runtimelib_path); + GGMLHEXAGON_LOG_INFO("runtime libpath=%s", g_hexagon_appcfg.runtime_libpath); + if (precision_mode.find("fp16") != std::string::npos) { g_hexagon_appcfg.precision_mode = 1; } else { g_hexagon_appcfg.precision_mode = 0; } - if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - ggmlhexagon_set_runtime_path(HEXAGON_BACKEND_CDSP, g_hexagon_appcfg.runtimelib_path); - } + ggmlhexagon_set_runtime_path(HEXAGON_BACKEND_CDSP, g_hexagon_appcfg.runtime_libpath); initialized = true; } @@ -1477,6 +1476,14 @@ static void ggmlhexagon_load_cfg() { static bool ggmlhexagon_check_valid_appcfg() { bool is_valid_appcfg = true; + GGMLHEXAGON_LOG_DEBUG("user's specified hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_DEBUG("user's 
specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); + if (g_hexagon_appcfg.hexagon_backend >= GGML_HEXAGON_MAX_DEVICES) { + GGMLHEXAGON_LOG_INFO("using default ggml backend"); + is_valid_appcfg = false; + } + if (HWACCEL_QNN_SINGLEGRAPH == g_hexagon_appcfg.hwaccel_approach) { GGMLHEXAGON_LOG_INFO("HWACCEL_QNN_SINGLEGRAPH not supported"); is_valid_appcfg = false; @@ -2599,7 +2606,7 @@ int qnn_instance::load_system() { if (nullptr == _system_lib_handle) { GGMLHEXAGON_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror()); //re-try with default path of QNN binary runtime lib - _lib_path = std::string(g_hexagon_appcfg.runtimelib_path); + _lib_path = std::string(g_hexagon_appcfg.runtime_libpath); #if !defined(__ANDROID__) && !defined(__linux__) system_lib_path = _lib_path + "QnnSystem.dll"; #else @@ -2881,7 +2888,7 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } #if defined(__ANDROID__) || defined(__linux__) - std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtimelib_path) + "libcdsprpc.so"); + std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtime_libpath) + "libcdsprpc.so"); full_path /= std::filesystem::path("libcdsprpc.so").filename(); _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _rpc_lib_handle) { @@ -5657,11 +5664,8 @@ static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_ GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); size_t dev_index = 0; - //case-1: test-backend-ops or other similar scenairo: calling ggml_backend_dev_init(dev, reinterpret_cast(i)) directly in user's code + //case-1: test-backend-ops or other similar scenario: calling ggml_backend_dev_init(dev, reinterpret_cast(i)) directly in user's code ggmlhexagon_load_cfg(); - GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend in cfgfile = %d", g_hexagon_appcfg.hexagon_backend); - GGMLHEXAGON_LOG_DEBUG("user's sepcified qnn runtime lib path in cfgfile = %s", g_hexagon_appcfg.runtimelib_path); - if (!ggmlhexagon_check_valid_appcfg()) { return nullptr; } @@ -5681,7 +5685,7 @@ static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_ GGMLHEXAGON_LOG_INFO("program specified dev_index %d\n", dev_index); } GGMLHEXAGON_LOG_DEBUG("hexagon_backend=%d", dev_index); - ggml_backend_t hexagon_backend = ggml_backend_hexagon_init(dev_index, g_hexagon_appcfg.runtimelib_path); + ggml_backend_t hexagon_backend = ggml_backend_hexagon_init(dev_index, g_hexagon_appcfg.runtime_libpath); GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); return hexagon_backend; @@ -5872,8 +5876,11 @@ static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) { static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) { GGML_UNUSED(reg); - if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); + //here is the trick: + //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP + //so return 1 return 1; } else { return GGML_HEXAGON_MAX_DEVICES; @@ -5925,16 +5932,6 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { //case-2: normal scenario, such as llama-cli or UI applicaton ggmlhexagon_load_cfg(); - GGMLHEXAGON_LOG_DEBUG("hwaccel approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, - ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); - 
GGMLHEXAGON_LOG_DEBUG("user's specified hexagon_backend=%d", g_hexagon_appcfg.hexagon_backend); - GGMLHEXAGON_LOG_DEBUG("user's specified runtime lib path=%s", g_hexagon_appcfg.runtimelib_path); - if (g_hexagon_appcfg.hexagon_backend >= GGML_HEXAGON_MAX_DEVICES) { - GGMLHEXAGON_LOG_INFO("using default ggml backend"); - GGMLHEXAGON_LOG_DEBUG("leave ggml_backend_hexagon_reg"); - return nullptr; - } - if (!ggmlhexagon_check_valid_appcfg()) { return nullptr; } @@ -6050,28 +6047,32 @@ static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_ /** * * @param device 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU/HEXAGON_BACKEND_CDSP - * @param qnn_lib_path QNN binrary runtime library path, such as "/data/local/tmp/" on Android or specified in JNI layer + * @param runtime_libpath binary runtime library path, such as "/data/local/tmp/" on Android or specified in user's code * @return */ -ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_path) { +ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_libpath) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); - //case-3: calling ggml_backend_hexagon_init() directly in user's code - ggmlhexagon_load_cfg(); - - if (nullptr == qnn_lib_path) + if (nullptr == runtime_libpath) return nullptr; + //case-3: calling ggml_backend_hexagon_init() directly in user's code + ggmlhexagon_load_cfg(); if (!ggmlhexagon_check_valid_appcfg()) { return nullptr; } GGMLHEXAGON_LOG_DEBUG("device %d", device); - GGMLHEXAGON_LOG_DEBUG("qnn_lib_path %s", qnn_lib_path); + GGMLHEXAGON_LOG_DEBUG("runtime libpath %s", runtime_libpath); if (device >= GGML_HEXAGON_MAX_DEVICES) { GGMLHEXAGON_LOG_ERROR("invalid device %d", device); return nullptr; } + if (0 != memcmp(runtime_libpath, g_hexagon_appcfg.runtime_libpath, strlen(g_hexagon_appcfg.runtime_libpath))) { + //re-setting runtime libpath + ggmlhexagon_set_runtime_path(device, runtime_libpath); + } + if (nullptr != g_hexagon_mgr[device].backend) { GGMLHEXAGON_LOG_DEBUG("backend %d(%s) already loaded", device, ggml_backend_hexagon_get_devname(device)); @@ -6081,7 +6082,7 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * qnn_lib_pat //don't initialize QNN when hwaccel approach is offload ggml op to Hexagon cDSP directly if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) { - qnn_instance * instance = ggmlqnn_init_qnn_instance(device, qnn_lib_path); + qnn_instance * instance = ggmlqnn_init_qnn_instance(device, runtime_libpath); if (nullptr == instance) return nullptr; } From bb2c6637c8dbe9b4d6d1735b39d40a088b9822dc Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 3 Apr 2025 21:17:37 +0800 Subject: [PATCH 164/200] ggml-hexagon: add gemma-3-4b-it-Q8_0.gguf to verify q8_0 mulmat on cDSP --- scripts/build-run-android.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index d970fcfb8a18f..9eeec4df7bd43 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -8,6 +8,7 @@ ANDROID_PLATFORM=android-34 ANDROID_NDK=${PWD}/android-ndk-r26c REMOTE_PATH=/data/local/tmp/ GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf +GGUF_MODEL_NAME=/sdcard/gemma-3-4b-it-Q8_0.gguf #QNN SDK could be found at: #https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk From 95d8ea1a772d4a0fbf7612dfb596d782e80b691b Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 3 Apr 2025 23:26:41 +0800 Subject: [PATCH 165/200] ggml-hexagon:add skeleton code 
of offload GGML_OP_SOFT_MAX/GGML_OP_RMS_NORM/GGML_OP_POOL_2D to Hexagon cDSP --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 43 ++- .../src/ggml-hexagon/kernels/ggmlop_ap_skel.c | 74 ++-- .../src/ggml-hexagon/kernels/ggmlop_ap_skel.h | 7 +- ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c | 331 +++--------------- .../ggml-hexagon/kernels/ggmlop_cdsp_skel.c | 66 ++-- scripts/build-run-android.sh | 2 +- 6 files changed, 151 insertions(+), 372 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 2c99cc663dafb..c0835ab6e9a9f 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -598,9 +598,9 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {true, GGML_OP_ADD, 2, "ggmlop_dsp_add", ggmlop_dsp_add}, {false, GGML_OP_ADD1, 0, nullptr, nullptr}, {false, GGML_OP_ACC, 0, nullptr, nullptr}, - {true, GGML_OP_SUB, 2, "ggmlop_dsp_sub", ggmlop_dsp_sub}, - {true, GGML_OP_MUL, 2, "ggmlop_dsp_mul", ggmlop_dsp_mul}, - {true, GGML_OP_DIV, 2, "ggmlop_dsp_div", ggmlop_dsp_div}, + {false, GGML_OP_SUB, 2, nullptr, nullptr}, + {false, GGML_OP_MUL, 2, nullptr, nullptr}, + {false, GGML_OP_DIV, 2, nullptr, nullptr}, {false, GGML_OP_SQR, 0, nullptr, nullptr}, {false, GGML_OP_SQRT, 0, nullptr, nullptr}, {false, GGML_OP_LOG, 0, nullptr, nullptr}, @@ -616,7 +616,7 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_CONCAT, 0, nullptr, nullptr}, {false, GGML_OP_SILU_BACK, 0, nullptr, nullptr}, {false, GGML_OP_NORM, 0, nullptr, nullptr}, - {false, GGML_OP_RMS_NORM, 0, nullptr, nullptr}, + {true, GGML_OP_RMS_NORM, 1, "ggmlop_dsp_rmsnorm", ggmlop_dsp_rmsnorm}, {false, GGML_OP_RMS_NORM_BACK, 0, nullptr, nullptr}, {false, GGML_OP_GROUP_NORM, 0, nullptr, nullptr}, {false, GGML_OP_L2_NORM, 0, nullptr, nullptr}, @@ -636,7 +636,7 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_DIAG, 0, nullptr, nullptr}, {false, GGML_OP_DIAG_MASK_INF, 0, nullptr, nullptr}, {false, GGML_OP_DIAG_MASK_ZERO, 0, nullptr, nullptr}, - {false, GGML_OP_SOFT_MAX, 0, nullptr, nullptr}, + {true, GGML_OP_SOFT_MAX, 1, "ggmlop_dsp_softmax", ggmlop_dsp_softmax}, {false, GGML_OP_SOFT_MAX_BACK, 0, nullptr, nullptr}, {false, GGML_OP_ROPE, 0, nullptr, nullptr}, {false, GGML_OP_ROPE_BACK, 0, nullptr, nullptr}, @@ -646,7 +646,7 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_IM2COL_BACK, 0, nullptr, nullptr}, {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr, nullptr}, {false, GGML_OP_POOL_1D, 0, nullptr, nullptr}, - {false, GGML_OP_POOL_2D, 0, nullptr, nullptr}, + {true, GGML_OP_POOL_2D, 1, "ggmlop_dsp_pool2d", ggmlop_dsp_pool2d}, {false, GGML_OP_POOL_2D_BACK, 0, nullptr, nullptr}, {false, GGML_OP_UPSCALE, 0, nullptr, nullptr}, {false, GGML_OP_PAD, 0, nullptr, nullptr}, @@ -694,10 +694,10 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, static_cast(GGML_UNARY_OP_EXP), 0, nullptr, nullptr} }; -static_assert(ggmlhexagon_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); -static_assert(ggmlhexagon_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not true"); -static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL].supported, "GGML_OP_MUL is not true"); -static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_NONE].supported, "GGML_OP_NONE is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_ADD].supported, "GGML_OP_ADD is not 
true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_MUL_MAT].supported, "GGML_OP_MUL_MAT is not true"); +static_assert(ggmlhexagon_k_op_caps[GGML_OP_SOFT_MAX].supported, "GGML_OP_SOFT_MAX is not true"); static_assert(std::size(ggmlhexagon_k_op_caps) == (static_cast(GGML_OP_COUNT) + static_cast(GGML_UNARY_OP_COUNT)), "pls check ggmlhexagon_k_op_caps and ensure is corresponding to latest ggml.h"); @@ -5018,6 +5018,7 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ dsptensor_0.nb[3] = src0->nb[3]; if (2 == input_tensor_count) { + GGML_ASSERT(nullptr != src1); dsptensor_1.data = src1->data; dsptensor_1.type = src1->type; dsptensor_1.data_len = ggml_nbytes(src1); @@ -5047,6 +5048,8 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ dsptensor_2.nb[2] = dst->nb[2]; dsptensor_2.nb[3] = dst->nb[3]; + memcpy(dsptensor_2.op_params, dst->op_params, GGML_MAX_OP_PARAMS / sizeof(int32_t)); + hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); if (AEE_SUCCESS != hexagon_error) { GGMLHEXAGON_LOG_WARN("ggmlop %s computation fail on cdsp", ggml_op_name(op->op)); @@ -5078,19 +5081,31 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const if (!ggml_are_same_shape(src0, src1)) { return false; } - - //TODO: offload quantize GGML_OP_ADD to cDSP - return ggmlhexagon_same_types(ctx, op_tensor); + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } case GGML_OP_MUL_MAT: { ggmlhexagon_dump_op_info(op_tensor); if (1 == g_hexagon_appcfg.enable_q_mulmat) - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q6_K + return (src0->type == GGML_TYPE_F32 + || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 + || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); else return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); } + case GGML_OP_SOFT_MAX:{ + if (!ggml_is_contiguous(op_tensor)) + return false; + if (!ggml_are_same_shape(src0, op_tensor)) + return false; + } + case GGML_OP_RMS_NORM: + case GGML_OP_POOL_2D: + { + + ggmlhexagon_dump_op_info(op_tensor); + } default: break; } diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c index ce5f0ae383fb2..3efbabdb0f67d 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c @@ -272,17 +272,17 @@ struct Interface { #define __QAIC_SLIM_EXPORT #endif -static const Type types[4]; -static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[2])}; -static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x30,0x4,0x2c,0x4,0x4,0x4}}; -static const Type types[4] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[3]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; -static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 
0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const Type types[5]; +static const Type* const typeArrays[7] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[2]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x7,&(typeArrays[0]),0x70,0x4,0x6c,0x4,0x4,0x4}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{0x40,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x10}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; static const Parameter* const parameterArrays[9] = {(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; -static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0x64,0x2c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xe4,0x6c,3,3,(&(parameterArrays[0])),0x4,0x4}}; static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; -static const char strings[146] = "dsp_setclocks\0dcvs_enable\0power_level\0dsp_mulmat\0dsp_div\0dsp_sub\0dsp_mul\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; -static const uint16_t methodStrings[119] = {49,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,57,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,65,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,38,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,73,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,0,26,81,14,121,130,143,95,143}; -static const uint16_t methodStringsArrays[8] = {114,117,110,88,66,44,22,0}; +static const char strings[167] = 
"dsp_setclocks\0dsp_rmsnorm\0dsp_softmax\0dcvs_enable\0power_level\0dsp_pool2d\0dsp_mulmat\0op_params\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[134] = {62,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,14,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,26,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,73,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,94,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,0,50,102,38,142,151,164,116,164}; +static const uint16_t methodStringsArrays[8] = {129,132,125,100,75,50,25,0}; __QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; #endif //_GGMLOP_SLIM_H @@ -315,7 +315,7 @@ __QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _ uint32_t _mid = 2; return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable); } -static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { int _nErr = 0; remote_arg* _praROutPostStart = _praROutPost; remote_arg** _ppraROutPostStart = _ppraROutPost; @@ -324,11 +324,12 @@ static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _AT _COPY(_rout1, 0, _primROut, 4, 16); _COPY(_rout2, 0, _primROut, 20, 16); _COPY(_rout3, 0, _primROut, 36, 4); - _COPY(_rout4, 0, _primROut, 40, 4); + _COPY(_rout4, 0, _primROut, 40, 64); + _COPY(_rout5, 0, _primROut, 104, 4); _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; return _nErr; } -static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* 
_praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -336,14 +337,14 @@ static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNU remote_arg** _ppraROutStart = _ppraROut; _ppraIn = &_praIn; _ppraROut = &_praROut; - _COPY(_primIn, 0, _rout5Len, 0, 4); - _praROut[0].buf.pv = _rout5[0]; - _praROut[0].buf.nLen = (4 * _rout5Len[0]); + _COPY(_primIn, 0, _rout6Len, 0, 4); + _praROut[0].buf.pv = _rout6[0]; + _praROut[0].buf.nLen = (4 * _rout6Len[0]); _ppraInStart[0] += (_praIn - _praInStart) + 0; _ppraROutStart[0] += (_praROut - _praROutStart) +1; return _nErr; } -static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { +static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -355,35 +356,36 @@ static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_U _COPY(_primIn, 4, _in1, 0, 16); _COPY(_primIn, 20, _in2, 0, 16); _COPY(_primIn, 36, _in3, 0, 4); - _COPY(_primIn, 40, _in4, 0, 4); - _COPY(_primIn, 44, _in5Len, 0, 4); - _praIn[0].buf.pv = (void*) _in5[0]; - _praIn[0].buf.nLen = (4 * _in5Len[0]); + _COPY(_primIn, 40, _in4, 0, 64); + _COPY(_primIn, 104, _in5, 0, 4); + _COPY(_primIn, 108, _in6Len, 0, 4); + _praIn[0].buf.pv = (void*) _in6[0]; + _praIn[0].buf.nLen = (4 * _in6Len[0]); _ppraInStart[0] += (_praIn - _praInStart) + 1; _ppraROutStart[0] += (_praROut - _praROutStart) +0; return _nErr; } -static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED 
uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { _numIn[0] += 0; _numROut[0] += 1; _numInH[0] += 0; _numROutH[0] += 0; } -static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { +static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { _numIn[0] += 1; _numROut[0] += 0; _numInH[0] += 0; _numROutH[0] += 0; } -static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintptr_t _in0[SLIM_IFPTR32(13, 8)], uintptr_t _in1[SLIM_IFPTR32(13, 8)], uintptr_t _rout2[SLIM_IFPTR32(13, 8)]) { +static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintptr_t _in0[SLIM_IFPTR32(29, 16)], uintptr_t _in1[SLIM_IFPTR32(29, 16)], uintptr_t _rout2[SLIM_IFPTR32(29, 16)]) { remote_arg* _pra = 0; int _numIn[1] = {0}; int _numROut[1] = {0}; int _numInH[1] = {0}; int _numROutH[1] = {0}; _allocator _al[1] = {{0}}; - uint32_t _primIn[25]= {0}; - uint32_t _primROut[11]= {0}; + uint32_t _primIn[57]= {0}; + uint32_t _primROut[27]= {0}; remote_arg* _praIn = 0; remote_arg* _praROut = 0; remote_arg* _praROutPost = 0; @@ -399,13 +401,15 @@ static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintp _numROut[0] = 0; _numInH[0] = 0; _numROutH[0] = 0; - _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[11]), (char**)&(((uint64_t*)_in0)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[12]), (uint32_t*)&(((uint32_t*)_in0)[14]))); - _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[11]), (char**)&(((uint64_t*)_in1)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[12]), (uint32_t*)&(((uint32_t*)_in1)[14]))); - _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), 
SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30]))); + _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30]))); if(_numIn[0]>=255){ + _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of input buffers\n"); return AEE_EUNSUPPORTED; } if(_numROut[0]>=255){ + _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of output buffers\n"); return AEE_EUNSUPPORTED; } _allocator_init(_al, 0, 0); @@ -424,13 +428,13 @@ static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintp } if(_praHROut == 0) (_praHROut = _praHIn + _numInH[0] + 0); - _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[11]), (char**)&(((uint64_t*)_in0)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[12]), (uint32_t*)&(((uint32_t*)_in0)[14])))); - _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 48), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[11]), (char**)&(((uint64_t*)_in1)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[12]), (uint32_t*)&(((uint32_t*)_in1)[14])))); - _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 96), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 
0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); + _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); - _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14])))); + _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); _QAIC_CATCH(_nErr) {} _CATCH_FARF(_nErr) { _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _mid, __func__); @@ -446,15 +450,15 @@ __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mulmat)(remote_handle64 _handle, c uint32_t _mid = 4; return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } -__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mul)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_softmax)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) 
__QAIC_STUB_ATTRIBUTE { uint32_t _mid = 5; return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } -__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_sub)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_rmsnorm)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { uint32_t _mid = 6; return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } -__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_div)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { +__QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_pool2d)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { uint32_t _mid = 7; return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h index e30f833f06d17..b048988410dc8 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h @@ -237,6 +237,7 @@ struct dsptensor { int32_t ne[4]; int32_t nb[4]; int32_t op; + int32_t op_params[16]; int32_t flags; void * data; int data_len; @@ -274,9 +275,9 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QA __QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 dcvs_enable) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mul)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_sub)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_div)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_softmax)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_rmsnorm)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_pool2d)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; #ifndef ggmlop_URI #define ggmlop_URI "file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" #endif /*ggmlop_URI*/ diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c index 06a56475f3db5..a504acd8d89fe 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c @@ -604,6 +604,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { }; +static struct ggml_compute_params params; // 
================================================================================================= // section-2: ggml-hexagon kernel's internal troubleshooting function // ================================================================================================= @@ -908,6 +909,20 @@ static inline void ggml_init(void) { } u = {i}; ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); } + + //FIXME:HVX multithreading should be utilized in hexagon-kernels + params.ith = 0; + params.nth = 1; + //FIXME:hardcode buffer size + params.wsize = 512 * 1024 * 1024; + params.wdata = (char*)malloc(params.wsize); + GGML_ASSERT(NULL != params.wdata); +} + +static inline void ggml_deinit(void) { + free(params.wdata); + params.wdata = NULL; + params.wsize = 0; } static inline int nearest_int(float fval) { @@ -1149,22 +1164,20 @@ static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, co } -static inline uint64 hexagon_perf_get_time_us(void) -{ +static inline uint64 hexagon_perf_get_time_us(void) { unsigned long long count; asm volatile (" %0 = c31:30 " : "=r"(count)); return (uint64)(count) * 10ull / 192ull; } static void ggml_time_init(void) { - } static int64_t ggml_time_ms(void) { return hexagon_perf_get_time_us() * 1000; } -int64_t ggml_time_us(void) { +static int64_t ggml_time_us(void) { return hexagon_perf_get_time_us(); } @@ -1186,6 +1199,9 @@ int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { int ggmlop_dsp_close(remote_handle64 handle) { if (handle) free((void*)handle); + + ggml_deinit(); + return 0; } @@ -1379,277 +1395,6 @@ int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tenso return 0; } -static void ggml_compute_forward_sub_f32( - const ggml_tensor * src0, - const ggml_tensor * src1, - struct ggml_tensor * dst) { - - memcpy(dst->ne, src1->ne, 16); - memcpy(dst->nb, src1->nb, 16); - - assert(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = 0; - const int nth = 1; - - const int nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - vDSP_vsub(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_sub_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = 
i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne0; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] - *src1_ptr; - } - } - } -} -int ggmlop_dsp_sub(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_sub_f32(src0, src1, dst); - } break; - default: - { - GGML_ABORT("fatal error"); - } - } - return 0; -} - -static void ggml_compute_forward_mul_f32( - const ggml_tensor * src0, - const ggml_tensor * src1, - struct ggml_tensor * dst) { - - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - memcpy(dst->ne, src1->ne, 16); - memcpy(dst->nb, src1->nb, 16); - - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = 0; - const int nth = 1; - - const int64_t nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - if (nb10 == sizeof(float)) { - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0 ; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - UNUSED(ggml_vec_mul_f32); - - vDSP_vmul(src0_ptr + r*ne10, 1, src1_ptr, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_mul_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne00; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] * (*src1_ptr); - } - } - } -} - -int ggmlop_dsp_mul(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); - switch (src0->type) { - case GGML_TYPE_F32: - { - if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_mul_f32(src0, src1, dst); - } else { - GGML_ABORT("fatal error"); - } - break; - } - default: - { - GGML_ABORT("fatal error"); - } - } - GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); - return 0; -} -static void 
ggml_compute_forward_div_f32( - const ggml_tensor * src0, - const ggml_tensor * src1, - struct ggml_tensor * dst) { - - memcpy(dst->ne, src1->ne, 16); - memcpy(dst->nb, src1->nb, 16); - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = 0; - const int nth = 1; - - const int64_t nr = ggml_nrows(src0); - - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - if (nb10 == sizeof(float)) { - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - const int64_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - - for (int64_t r = 0; r < nr0; ++r) { -#ifdef GGML_USE_ACCELERATE - UNUSED(ggml_vec_div_f32); - - vDSP_vdiv(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10); -#else - ggml_vec_div_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); -#endif - } - } - } else { - // src1 is not contiguous - for (int64_t ir = ith; ir < nr; ir += nth) { - // src0 and dst are same shape => same indices - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int64_t i03 = ir/(ne02*ne01); - const int64_t i02 = (ir - i03*ne02*ne01)/ne01; - const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int64_t i13 = i03 % ne13; - const int64_t i12 = i02 % ne12; - const int64_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int64_t i0 = 0; i0 < ne00; ++i0) { - const int64_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - dst_ptr[i0] = src0_ptr[i0] / (*src1_ptr); - } - } - } -} - -int ggmlop_dsp_div(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_div_f32(src0, src1, dst); - } break; - - default: - { - GGML_ABORT("fatal error"); - } - } - - return 0; -} - static void ggml_compute_forward_mul_mat_one_chunk( const struct ggml_compute_params * params, const ggml_tensor * src0, @@ -1750,6 +1495,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( } } +//FIXME: only support fp32 mulmat on cDSP int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); ggmlhexagon_dump_tensor(src0, 0); @@ -1775,12 +1521,6 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te const int ith = 0; const int nth = 1; - struct ggml_compute_params params; - params.ith = 0; - params.nth = 1; - params.wsize = 0; - params.wdata = NULL; - GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); GGML_ASSERT(ne2 == ne12); @@ -1822,8 +1562,8 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te #endif if (src1->type != vec_dot_type) { - params.wsize = ggml_row_size(vec_dot_type, ggml_nelements(src1)); - params.wdata = 
(char*)malloc(params.wsize); + size_t wsize = ggml_row_size(vec_dot_type, ggml_nelements(src1)); + GGML_ASSERT(wsize < params.wsize); } if (src1->type != vec_dot_type) { @@ -1851,7 +1591,6 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te } } - // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) const int32_t nr0 = ne0; @@ -1913,10 +1652,28 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te } current_chunk++; } - if (src1->type != vec_dot_type) { - free(params.wdata); - } GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); return 0; } + +int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + + return 0; +} + +int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c index f2660e0a518d1..7cce9d050f3fb 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c @@ -272,17 +272,17 @@ struct Interface { #define __QAIC_SLIM_EXPORT #endif -static const Type types[4]; -static const Type* const typeArrays[6] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[0]),&(types[2])}; -static const StructType structTypes[1] = {{0x6,&(typeArrays[0]),0x30,0x4,0x2c,0x4,0x4,0x4}}; -static const Type types[4] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[3]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; -static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x34,0x40),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; +static const Type types[5]; +static const Type* const typeArrays[7] = {&(types[0]),&(types[1]),&(types[1]),&(types[0]),&(types[2]),&(types[0]),&(types[3])}; +static const StructType structTypes[1] = {{0x7,&(typeArrays[0]),0x70,0x4,0x6c,0x4,0x4,0x4}}; +static const Type types[5] = {{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4},{0x10,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x4}}, 8,0x4},{0x40,{{(const uintptr_t)&(types[0]),(const uintptr_t)0x10}}, 8,0x4},{SLIM_IFPTR32(0x8,0x10),{{(const uintptr_t)&(types[4]),(const uintptr_t)0x0}}, 9,SLIM_IFPTR32(0x4,0x8)},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4}}; +static const Parameter parameters[6] = {{SLIM_IFPTR32(0x8,0x10),{{(const 
uintptr_t)0x0,0}}, 4,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),3,0},{SLIM_IFPTR32(0x4,0x8),{{(const uintptr_t)0xdeadc0de,(const uintptr_t)0}}, 0,SLIM_IFPTR32(0x4,0x8),0,0},{0x4,{{(const uintptr_t)0,(const uintptr_t)1}}, 2,0x4,0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),0,0},{SLIM_IFPTR32(0x74,0x80),{{(const uintptr_t)&(structTypes[0]),0}}, 22,SLIM_IFPTR32(0x4,0x8),3,0}}; static const Parameter* const parameterArrays[9] = {(&(parameters[4])),(&(parameters[4])),(&(parameters[5])),(&(parameters[3])),(&(parameters[3])),(&(parameters[3])),(&(parameters[0])),(&(parameters[1])),(&(parameters[2]))}; -static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0x64,0x2c,3,3,(&(parameterArrays[0])),0x4,0x4}}; +static const Method methods[4] = {{REMOTE_SCALARS_MAKEX(0,0,0x2,0x0,0x0,0x1),0x4,0x0,2,2,(&(parameterArrays[6])),0x4,0x1},{REMOTE_SCALARS_MAKEX(0,0,0x0,0x0,0x1,0x0),0x0,0x0,1,1,(&(parameterArrays[8])),0x1,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x1,0x0,0x0,0x0),0xc,0x0,3,3,(&(parameterArrays[3])),0x4,0x0},{REMOTE_SCALARS_MAKEX(0,0,0x3,0x2,0x0,0x0),0xe4,0x6c,3,3,(&(parameterArrays[0])),0x4,0x4}}; static const Method* const methodArrays[8] = {&(methods[0]),&(methods[1]),&(methods[2]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3]),&(methods[3])}; -static const char strings[146] = "dsp_setclocks\0dcvs_enable\0power_level\0dsp_mulmat\0dsp_div\0dsp_sub\0dsp_mul\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; -static const uint16_t methodStrings[119] = {49,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,57,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,65,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,38,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,73,116,111,140,137,134,89,106,101,111,140,137,134,89,106,126,111,140,137,134,89,106,0,26,81,14,121,130,143,95,143}; -static const uint16_t methodStringsArrays[8] = {114,117,110,88,66,44,22,0}; +static const char strings[167] = "dsp_setclocks\0dsp_rmsnorm\0dsp_softmax\0dcvs_enable\0power_level\0dsp_pool2d\0dsp_mulmat\0op_params\0dsp_add\0latency\0flags\0close\0src1\0data\0type\0src0\0open\0dst\0uri\0op\0nb\0ne\0h\0"; +static const uint16_t methodStrings[134] = {62,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,14,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,26,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,73,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,94,137,132,161,158,155,84,110,127,122,132,161,158,155,84,110,127,147,132,161,158,155,84,110,127,0,50,102,38,142,151,164,116,164}; +static const uint16_t methodStringsArrays[8] = {129,132,125,100,75,50,25,0}; __QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[0]),0,0,&(methodStringsArrays [0]),methodStrings,strings}; #endif //_GGMLOP_SLIM_H extern int adsp_mmap_fd_getinfo(int, uint32_t *); @@ -291,7 +291,7 @@ 
extern "C" { #endif _ATTRIBUTE_VISIBILITY uint32_t ggmlop_skel_handle_invoke_qaic_version = 10048; _ATTRIBUTE_VISIBILITY char ggmlop_skel_handle_invoke_uri[77+1]="file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; -static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { int _nErr = 0; remote_arg* _praROutPostStart = _praROutPost; remote_arg** _ppraROutPostStart = _ppraROutPost; @@ -300,11 +300,12 @@ static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTR _COPY(_primROut, 4, _rout1, 0, 16); _COPY(_primROut, 20, _rout2, 0, 16); _COPY(_primROut, 36, _rout3, 0, 4); - _COPY(_primROut, 40, _rout4, 0, 4); + _COPY(_primROut, 40, _rout4, 0, 64); + _COPY(_primROut, 104, _rout5, 0, 4); _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; return _nErr; } -static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[1], _ATTRIBUTE_UNUSED char* _rout5[1], _ATTRIBUTE_UNUSED uint32_t _rout5Len[1]) { +static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -312,15 +313,15 @@ static __inline int _skel_unpack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_U remote_arg** _ppraROutStart = _ppraROut; _ppraIn = &_praIn; _ppraROut = &_praROut; - _COPY(_rout5Len, 0, _primIn, 0, 4); - _QAIC_ASSERT(_nErr, ((_praROut[0].buf.nLen / 4)) >= 
(size_t)(_rout5Len[0])); - _rout5[0] = _praROut[0].buf.pv; + _COPY(_rout6Len, 0, _primIn, 0, 4); + _QAIC_ASSERT(_nErr, ((_praROut[0].buf.nLen / 4)) >= (size_t)(_rout6Len[0])); + _rout6[0] = _praROut[0].buf.pv; _ppraInStart[0] += (_praIn - _praInStart) + 0; _ppraROutStart[0] += (_praROut - _praROutStart) +1; _QAIC_CATCH(_nErr) {} return _nErr; } -static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[1], _ATTRIBUTE_UNUSED char* _in5[1], _ATTRIBUTE_UNUSED uint32_t _in5Len[1]) { +static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { int _nErr = 0; remote_arg* _praInStart = _praIn; remote_arg** _ppraInStart = _ppraIn; @@ -332,10 +333,11 @@ static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE _COPY(_in1, 0, _primIn, 4, 16); _COPY(_in2, 0, _primIn, 20, 16); _COPY(_in3, 0, _primIn, 36, 4); - _COPY(_in4, 0, _primIn, 40, 4); - _COPY(_in5Len, 0, _primIn, 44, 4); - _QAIC_ASSERT(_nErr, ((_praIn[0].buf.nLen / 4)) >= (size_t)(_in5Len[0])); - _in5[0] = _praIn[0].buf.pv; + _COPY(_in4, 0, _primIn, 40, 64); + _COPY(_in5, 0, _primIn, 104, 4); + _COPY(_in6Len, 0, _primIn, 108, 4); + _QAIC_ASSERT(_nErr, ((_praIn[0].buf.nLen / 4)) >= (size_t)(_in6Len[0])); + _in6[0] = _praIn[0].buf.pv; _ppraInStart[0] += (_praIn - _praInStart) + 1; _ppraROutStart[0] += (_praROut - _praROutStart) +0; _QAIC_CATCH(_nErr) {} @@ -343,9 +345,9 @@ static __inline int _skel_unpack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE } static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, const dsptensor*, dsptensor*), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { remote_arg* _praEnd = 0; - uintptr_t _in0[SLIM_IFPTR32(13, 8)] = {0}; - uintptr_t _in1[SLIM_IFPTR32(13, 8)] = {0}; - uintptr_t _rout2[SLIM_IFPTR32(13, 8)] = {0}; + uintptr_t _in0[SLIM_IFPTR32(29, 16)] = {0}; + uintptr_t _in1[SLIM_IFPTR32(29, 16)] = {0}; + uintptr_t _rout2[SLIM_IFPTR32(29, 16)] = {0}; uint32_t* _primIn= 0; int _numIn[1] = {0}; uint32_t* _primROut= 0; @@ -370,9 +372,9 @@ static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, _QAIC_ASSERT(_nErr, REMOTE_SCALARS_OUTHANDLES(_sc)==0); _QAIC_ASSERT(_nErr, (_pra + ((1 + 1) + (((0 + 0) + 0) + 0))) <= _praEnd); _numIn[0] = (REMOTE_SCALARS_INBUFS(_sc) - 1); - 
_QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 100); + _QAIC_ASSERT(_nErr, _pra[0].buf.nLen >= 228); _primIn = _pra[0].buf.pv; - _QAIC_ASSERT(_nErr, _pra[(_numIn[0] + 1)].buf.nLen >= 44); + _QAIC_ASSERT(_nErr, _pra[(_numIn[0] + 1)].buf.nLen >= 108); _primROut = _pra[(_numIn[0] + 1)].buf.pv; _numInH[0] = REMOTE_SCALARS_INHANDLES(_sc); _numROut[0] = REMOTE_SCALARS_OUTBUFS(_sc); @@ -386,11 +388,11 @@ static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, } if(_praHROut == 0) (_praHROut = _praHIn + _numInH[0] + 0); - _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[11]), (char**)&(((uint64_t*)_in0)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[12]), (uint32_t*)&(((uint32_t*)_in0)[14])))); - _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 48), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[11]), (char**)&(((uint64_t*)_in1)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[12]), (uint32_t*)&(((uint32_t*)_in1)[14])))); - _TRY(_nErr, _skel_unpack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 96), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14])))); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); + _TRY(_nErr, _skel_unpack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); + _TRY(_nErr, _skel_unpack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), 
(uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); _TRY(_nErr, _pfn(_h, (const dsptensor*)_in0, (const dsptensor*)_in1, (dsptensor*)_rout2)); - _TRY(_nErr, _skel_pack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[11]), (char**)&(((uint64_t*)_rout2)[6])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[12]), (uint32_t*)&(((uint32_t*)_rout2)[14])))); + _TRY(_nErr, _skel_pack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); _QAIC_CATCH(_nErr) {} _allocator_deinit(_al); return _nErr; @@ -607,11 +609,11 @@ __QAIC_SKEL_EXPORT int __QAIC_SKEL(ggmlop_skel_handle_invoke)(remote_handle64 _h case 4: return _skel_method(__QAIC_IMPL(ggmlop_dsp_mulmat), _h, _sc, _pra); case 5: - return _skel_method(__QAIC_IMPL(ggmlop_dsp_mul), _h, _sc, _pra); + return _skel_method(__QAIC_IMPL(ggmlop_dsp_softmax), _h, _sc, _pra); case 6: - return _skel_method(__QAIC_IMPL(ggmlop_dsp_sub), _h, _sc, _pra); + return _skel_method(__QAIC_IMPL(ggmlop_dsp_rmsnorm), _h, _sc, _pra); case 7: - return _skel_method(__QAIC_IMPL(ggmlop_dsp_div), _h, _sc, _pra); + return _skel_method(__QAIC_IMPL(ggmlop_dsp_pool2d), _h, _sc, _pra); } return AEE_EUNSUPPORTED; } diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 9eeec4df7bd43..fe0b84b223f1c 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -7,8 +7,8 @@ PWD=`pwd` ANDROID_PLATFORM=android-34 ANDROID_NDK=${PWD}/android-ndk-r26c REMOTE_PATH=/data/local/tmp/ -GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf GGUF_MODEL_NAME=/sdcard/gemma-3-4b-it-Q8_0.gguf +GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf #QNN SDK could be found at: #https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk From 507c5b279bb19b5dec5adf88aee615d8efc15d00 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 4 Apr 2025 23:44:30 +0800 Subject: [PATCH 166/200] ggml-hexagon: release ggml-dsp v0.60 on cDSP side --- ggml/src/ggml-hexagon/kernels/Makefile | 2 +- .../kernels/{ggmlop_cdsp.c => ggml-dsp.c} | 329 +----------------- ggml/src/ggml-hexagon/kernels/ggml-dsp.h | 327 +++++++++++++++++ scripts/ggml-hexagon.cfg | 4 + 4 files changed, 342 insertions(+), 320 deletions(-) rename ggml/src/ggml-hexagon/kernels/{ggmlop_cdsp.c => ggml-dsp.c} (82%) create mode 100644 ggml/src/ggml-hexagon/kernels/ggml-dsp.h diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile index 0a58333a5f895..1a39011a02664 100755 --- a/ggml/src/ggml-hexagon/kernels/Makefile +++ b/ggml/src/ggml-hexagon/kernels/Makefile @@ -9,7 +9,7 @@ CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initi LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads 
-fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET} -SRCS = ggmlop_cdsp.c ggmlop_cdsp_skel.c +SRCS = ggml-dsp.c ggmlop_cdsp_skel.c OBJS = $(patsubst %.c, %.o, $(SRCS)) ALL:$(OBJS) diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c similarity index 82% rename from ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c rename to ggml/src/ggml-hexagon/kernels/ggml-dsp.c index a504acd8d89fe..43b7017f5abf4 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -37,335 +37,26 @@ #include "hexagon_protos.h" #include "ggmlop_ap_skel.h" +#include "ggml-dsp.h" // ================================================================================================= // section-1: forward/prototype declaration,global vars,macros,data structures // ================================================================================================= #define ggml_tensor dsptensor -#define GGML_MAX_DIMS 4 - -#define ALIGN_128_BYTE 128 - -#define GGML_UNUSED(x) (void)(x) - -#define UNUSED GGML_UNUSED - -#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) - -#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) - -#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) - -#define MIN(a, b) ((a) < (b) ? (a) : (b)) -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - -#if UINTPTR_MAX == 0xFFFFFFFF -#define GGML_MEM_ALIGN 4 -#else -#define GGML_MEM_ALIGN 16 -#endif - -#define GGML_RESTRICT - -#define static_assert(a, b) do { } while (0) - -#define GROUP_MAX_EPS 1e-15f - -// QK = number of values after dequantization -// QK_K = super-block size -#define QK_K 256 -#define K_SCALE_SIZE 12 - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) - -#if 0//def NDEBUG -#define GGMLQNN_DEBUG 0 -#else -#define GGMLQNN_DEBUG 1 -#endif - -#define GGMLHEXAGON_LOGBUF_LEN 4096 -#define GGML_QNN_TMPBUF_LEN 256 -#if GGMLQNN_DEBUG -#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) -#else -#define GGMLHEXAGON_LOG_DEBUG(...) 
-#endif -#define GGMLQNN_DUMP_TENSOR(tensor) ggmlhexagon_dump_tensor(tensor, #tensor) - -#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ - const type prefix##0 = (pointer)->array[0]; \ - GGML_UNUSED(prefix##0); -#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ - GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ - const type prefix##1 = (pointer)->array[1]; \ - GGML_UNUSED(prefix##1); -#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ - GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ - const type prefix##2 = (pointer)->array[2]; \ - GGML_UNUSED(prefix##2); -#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ - GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ - const type prefix##3 = (pointer)->array[3]; \ - GGML_UNUSED(prefix##3); - -#define GGML_TENSOR_UNARY_OP_LOCALS \ - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - -#define GGML_TENSOR_BINARY_OP_LOCALS \ - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ - GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ - GGML_TENSOR_LOCALS(size_t, nb, dst, nb) - -#define GGML_TENSOR_BINARY_OP_LOCALS01 \ - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ - GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ - GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) - -enum ggmlhexagon_log_level { - GGMLHEXAGON_LOG_LEVEL_NONE = 0, - GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, - GGMLHEXAGON_LOG_LEVEL_INFO = 2, - GGMLHEXAGON_LOG_LEVEL_WARN = 3, - GGMLHEXAGON_LOG_LEVEL_ERROR = 4, - GGMLHEXAGON_LOG_LEVEL_CONT = 5, -}; - -enum ggml_type { - GGML_TYPE_F32 = 0, - GGML_TYPE_F16 = 1, - GGML_TYPE_Q4_0 = 2, - GGML_TYPE_Q4_1 = 3, - // GGML_TYPE_Q4_2 = 4, support has been removed - // GGML_TYPE_Q4_3 = 5, support has been removed - GGML_TYPE_Q5_0 = 6, - GGML_TYPE_Q5_1 = 7, - GGML_TYPE_Q8_0 = 8, - GGML_TYPE_Q8_1 = 9, - GGML_TYPE_Q2_K = 10, - GGML_TYPE_Q3_K = 11, - GGML_TYPE_Q4_K = 12, - GGML_TYPE_Q5_K = 13, - GGML_TYPE_Q6_K = 14, - GGML_TYPE_Q8_K = 15, - GGML_TYPE_IQ2_XXS = 16, - GGML_TYPE_IQ2_XS = 17, - GGML_TYPE_IQ3_XXS = 18, - GGML_TYPE_IQ1_S = 19, - GGML_TYPE_IQ4_NL = 20, - GGML_TYPE_IQ3_S = 21, - GGML_TYPE_IQ2_S = 22, - GGML_TYPE_IQ4_XS = 23, - GGML_TYPE_I8 = 24, - GGML_TYPE_I16 = 25, - GGML_TYPE_I32 = 26, - GGML_TYPE_I64 = 27, - GGML_TYPE_F64 = 28, - GGML_TYPE_IQ1_M = 29, - GGML_TYPE_BF16 = 30, - // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files - // GGML_TYPE_Q4_0_4_8 = 32, - // GGML_TYPE_Q4_0_8_8 = 33, - GGML_TYPE_TQ1_0 = 34, - GGML_TYPE_TQ2_0 = 35, - // GGML_TYPE_IQ4_NL_4_4 = 36, - // GGML_TYPE_IQ4_NL_4_8 = 37, - // GGML_TYPE_IQ4_NL_8_8 = 38, - GGML_TYPE_COUNT = 39, -}; - -typedef double ggml_float; -typedef uint16_t ggml_fp16_t; -typedef uint16_t ggml_half; -typedef uint32_t ggml_half2; -typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, - const void * GGML_RESTRICT y, size_t by, int nrc); -typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - -typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - -struct ggml_compute_params { - // ith = thread index, nth = 
number of threads - int ith, nth; - - // work buffer for all threads - size_t wsize; - void * wdata; -}; - -#define QK4_0 32 -typedef struct { - ggml_half d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants -} block_q4_0; - -#define QK4_1 32 -typedef struct { - union { - struct { - ggml_half d; // delta - ggml_half m; // min - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; - uint8_t qs[QK4_1 / 2]; // nibbles / quants -} block_q4_1; - -#define QK5_0 32 -typedef struct { - ggml_half d; // delta - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} block_q5_0; - -#define QK5_1 32 -typedef struct { - union { - struct { - ggml_half d; // delta - ggml_half m; // min - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_1 / 2]; // nibbles / quants -} block_q5_1; - -#define QK8_0 32 -typedef struct { - ggml_half d; // delta - int8_t qs[QK8_0]; // quants -} block_q8_0; - -#define QK8_1 32 -typedef struct { - union { - struct { - ggml_half d; // delta - ggml_half s; // d * sum(qs[i]) - } GGML_COMMON_AGGR_S; - ggml_half2 ds; - } GGML_COMMON_AGGR_U; - int8_t qs[QK8_1]; // quants -} block_q8_1; - -// 2-bit quantization -// weight is represented as x = a * q + b -// 16 blocks of 16 elements each -// Effectively 2.625 bits per weight -typedef struct { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - union { - struct { - ggml_half d; // super-block scale for quantized scales - ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; -} block_q2_K; - -// 3-bit quantization -// weight is represented as x = a * q -// 16 blocks of 16 elements each -// Effectively 3.4375 bits per weight -typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[12]; // scales, quantized with 6 bits - ggml_half d; // super-block scale -} block_q3_K; - -// 4-bit quantization -// 8 blocks of 32 elements each -// weight is represented as x = a * q + b -// Effectively 4.5 bits per weight -typedef struct { - union { - struct { - ggml_half d; // super-block scale for quantized scales - ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; - -// 5-bit quantization -// 8 blocks of 32 elements each -// weight is represented as x = a * q + b -// Effectively 5.5 bits per weight -typedef struct { - union { - struct { - ggml_half d; // super-block scale for quantized scales - ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; - -// 6-bit quantization -// weight is represented as x = a * q -// 16 blocks of 16 elements each -// Effectively 6.5625 bits per weight -typedef struct { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - ggml_half d; // super-block scale -} block_q6_K; - -typedef struct { - float d; // delta - int8_t qs[QK_K]; // quants - int16_t bsums[QK_K/16]; // sum of quants in groups of 16 -} 
block_q8_K; - -struct ggml_type_traits { - const char * type_name; - int64_t blck_size; - int64_t blck_size_interleave; // interleave elements in blocks - size_t type_size; - bool is_quantized; - ggml_to_float_t to_float; - ggml_from_float_t from_float_ref; -}; - -struct ggml_type_traits_cpu { - ggml_from_float_t from_float; - ggml_vec_dot_t vec_dot; - enum ggml_type vec_dot_type; - int64_t nrows; // number of rows to process simultaneously -}; - static size_t ggml_nbytes(const struct ggml_tensor * tensor); static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...); static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); -static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); -static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); -static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); +static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); +static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); +static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); static float ggml_table_f32_f16[1 << 16]; +static struct ggml_compute_params params; + static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = { .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, @@ -604,7 +295,6 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { }; -static struct ggml_compute_params params; // ================================================================================================= // section-2: ggml-hexagon kernel's internal troubleshooting function // ================================================================================================= @@ -659,7 +349,7 @@ static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_ } // ================================================================================================= -// section-3: tiny ggml-dsp(ggml on Hexagon cDSP, ported from original ggml) +// section-3: tiny ggml-dsp: a customized ggml on Hexagon cDSP, ported from original ggml // ================================================================================================= static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { return &type_traits_cpu[type]; @@ -1254,7 +944,7 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 } // ================================================================================================= -// section-5: ggml-hexagon kernel function: offload ggmlop to cDSP through Hexagon C API and SIMD instructions +// section-5: ggml-hexagon kernel functions: offload ggmlop to cDSP through Hexagon C API and SIMD instructions // ================================================================================================= inline static 
void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) { HVX_Vector * va; @@ -1373,6 +1063,7 @@ static void ggml_compute_forward_add_f32( GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); } +//FIXME: failed with test-backend-ops when disable ion rpc mempool int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h new file mode 100644 index 0000000000000..e7c5633cc93a9 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -0,0 +1,327 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define GGML_MAX_DIMS 4 + +#define ALIGN_128_BYTE 128 + +#define GGML_UNUSED(x) (void)(x) + +#define UNUSED GGML_UNUSED + +#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) + +#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__) + +#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x) + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) + +#if UINTPTR_MAX == 0xFFFFFFFF +#define GGML_MEM_ALIGN 4 +#else +#define GGML_MEM_ALIGN 16 +#endif + +#define GGML_RESTRICT + +#define static_assert(a, b) do { } while (0) + +#define GROUP_MAX_EPS 1e-15f + +// QK = number of values after dequantization +// QK_K = super-block size +#define QK_K 256 +#define K_SCALE_SIZE 12 + +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) + +#if 0//def NDEBUG +#define GGMLHEXAGON_DEBUG 0 +#else +#define GGMLHEXAGON_DEBUG 1 +#endif + +#define GGMLHEXAGON_LOGBUF_LEN 4096 +#define GGMLHEXAGON_TMPBUF_LEN 256 +#if GGMLHEXAGON_DEBUG +#define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGMLHEXAGON_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +#define GGMLHEXAGON_LOG_DEBUG(...) 
+#endif + +#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \ + const type prefix##0 = (pointer)->array[0]; \ + GGML_UNUSED(prefix##0); +#define GGML_TENSOR_LOCALS_2(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_1 (type, prefix, pointer, array) \ + const type prefix##1 = (pointer)->array[1]; \ + GGML_UNUSED(prefix##1); +#define GGML_TENSOR_LOCALS_3(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_2 (type, prefix, pointer, array) \ + const type prefix##2 = (pointer)->array[2]; \ + GGML_UNUSED(prefix##2); +#define GGML_TENSOR_LOCALS(type, prefix, pointer, array) \ + GGML_TENSOR_LOCALS_3 (type, prefix, pointer, array) \ + const type prefix##3 = (pointer)->array[3]; \ + GGML_UNUSED(prefix##3); + +#define GGML_TENSOR_UNARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \ + GGML_TENSOR_LOCALS(size_t, nb, dst, nb) + +#define GGML_TENSOR_BINARY_OP_LOCALS01 \ + GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \ + GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \ + GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \ + GGML_TENSOR_LOCALS(size_t, nb1, src1, nb) + +enum ggmlhexagon_log_level { + GGMLHEXAGON_LOG_LEVEL_NONE = 0, + GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, + GGMLHEXAGON_LOG_LEVEL_INFO = 2, + GGMLHEXAGON_LOG_LEVEL_WARN = 3, + GGMLHEXAGON_LOG_LEVEL_ERROR = 4, + GGMLHEXAGON_LOG_LEVEL_CONT = 5, +}; + +enum ggml_type { + GGML_TYPE_F32 = 0, + GGML_TYPE_F16 = 1, + GGML_TYPE_Q4_0 = 2, + GGML_TYPE_Q4_1 = 3, + // GGML_TYPE_Q4_2 = 4, support has been removed + // GGML_TYPE_Q4_3 = 5, support has been removed + GGML_TYPE_Q5_0 = 6, + GGML_TYPE_Q5_1 = 7, + GGML_TYPE_Q8_0 = 8, + GGML_TYPE_Q8_1 = 9, + GGML_TYPE_Q2_K = 10, + GGML_TYPE_Q3_K = 11, + GGML_TYPE_Q4_K = 12, + GGML_TYPE_Q5_K = 13, + GGML_TYPE_Q6_K = 14, + GGML_TYPE_Q8_K = 15, + GGML_TYPE_IQ2_XXS = 16, + GGML_TYPE_IQ2_XS = 17, + GGML_TYPE_IQ3_XXS = 18, + GGML_TYPE_IQ1_S = 19, + GGML_TYPE_IQ4_NL = 20, + GGML_TYPE_IQ3_S = 21, + GGML_TYPE_IQ2_S = 22, + GGML_TYPE_IQ4_XS = 23, + GGML_TYPE_I8 = 24, + GGML_TYPE_I16 = 25, + GGML_TYPE_I32 = 26, + GGML_TYPE_I64 = 27, + GGML_TYPE_F64 = 28, + GGML_TYPE_IQ1_M = 29, + GGML_TYPE_BF16 = 30, + // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files + // GGML_TYPE_Q4_0_4_8 = 32, + // GGML_TYPE_Q4_0_8_8 = 33, + GGML_TYPE_TQ1_0 = 34, + GGML_TYPE_TQ2_0 = 35, + // GGML_TYPE_IQ4_NL_4_4 = 36, + // GGML_TYPE_IQ4_NL_4_8 = 37, + // GGML_TYPE_IQ4_NL_8_8 = 38, + GGML_TYPE_COUNT = 39, +}; + +typedef double ggml_float; +typedef uint16_t ggml_fp16_t; +typedef uint16_t ggml_half; +typedef uint32_t ggml_half2; +typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, + const void * GGML_RESTRICT y, size_t by, int nrc); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); +typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); + +struct ggml_compute_params { + // ith = thread index, nth = number of threads + int ith, nth; + + // work buffer for all threads + size_t 
wsize; + void * wdata; +}; + +#define QK4_0 32 +typedef struct { + ggml_half d; // delta + uint8_t qs[QK4_0 / 2]; // nibbles / quants +} block_q4_0; + +#define QK4_1 32 +typedef struct { + union { + struct { + ggml_half d; // delta + ggml_half m; // min + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t qs[QK4_1 / 2]; // nibbles / quants +} block_q4_1; + +#define QK5_0 32 +typedef struct { + ggml_half d; // delta + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_0 / 2]; // nibbles / quants +} block_q5_0; + +#define QK5_1 32 +typedef struct { + union { + struct { + ggml_half d; // delta + ggml_half m; // min + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t qh[4]; // 5-th bit of quants + uint8_t qs[QK5_1 / 2]; // nibbles / quants +} block_q5_1; + +#define QK8_0 32 +typedef struct { + ggml_half d; // delta + int8_t qs[QK8_0]; // quants +} block_q8_0; + +#define QK8_1 32 +typedef struct { + union { + struct { + ggml_half d; // delta + ggml_half s; // d * sum(qs[i]) + } GGML_COMMON_AGGR_S; + ggml_half2 ds; + } GGML_COMMON_AGGR_U; + int8_t qs[QK8_1]; // quants +} block_q8_1; + +// 2-bit quantization +// weight is represented as x = a * q + b +// 16 blocks of 16 elements each +// Effectively 2.625 bits per weight +typedef struct { + uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits + uint8_t qs[QK_K/4]; // quants + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; +} block_q2_K; + +// 3-bit quantization +// weight is represented as x = a * q +// 16 blocks of 16 elements each +// Effectively 3.4375 bits per weight +typedef struct { + uint8_t hmask[QK_K/8]; // quants - high bit + uint8_t qs[QK_K/4]; // quants - low 2 bits + uint8_t scales[12]; // scales, quantized with 6 bits + ggml_half d; // super-block scale +} block_q3_K; + +// 4-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 4.5 bits per weight +typedef struct { + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qs[QK_K/2]; // 4--bit quants +} block_q4_K; + +// 5-bit quantization +// 8 blocks of 32 elements each +// weight is represented as x = a * q + b +// Effectively 5.5 bits per weight +typedef struct { + union { + struct { + ggml_half d; // super-block scale for quantized scales + ggml_half dmin; // super-block scale for quantized mins + } GGML_COMMON_AGGR_S; + ggml_half2 dm; + } GGML_COMMON_AGGR_U; + uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits + uint8_t qh[QK_K/8]; // quants, high bit + uint8_t qs[QK_K/2]; // quants, low 4 bits +} block_q5_K; + +// 6-bit quantization +// weight is represented as x = a * q +// 16 blocks of 16 elements each +// Effectively 6.5625 bits per weight +typedef struct { + uint8_t ql[QK_K/2]; // quants, lower 4 bits + uint8_t qh[QK_K/4]; // quants, upper 2 bits + int8_t scales[QK_K/16]; // scales, quantized with 8 bits + ggml_half d; // super-block scale +} block_q6_K; + +typedef struct { + float d; // delta + int8_t qs[QK_K]; // quants + int16_t bsums[QK_K/16]; // sum of quants in groups of 16 +} block_q8_K; + +struct ggml_type_traits { + const char * type_name; + int64_t 
blck_size; + int64_t blck_size_interleave; // interleave elements in blocks + size_t type_size; + bool is_quantized; + ggml_to_float_t to_float; + ggml_from_float_t from_float_ref; +}; + +struct ggml_type_traits_cpu { + ggml_from_float_t from_float; + ggml_vec_dot_t vec_dot; + enum ggml_type vec_dot_type; + int64_t nrows; // number of rows to process simultaneously +}; + +#ifdef __cplusplus +} +#endif diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 54c4eb6e1f851..f76fd9a749fbf 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -1,5 +1,9 @@ [general] +#version of ggml-hexagon.cpp on ARM-AP side version = "1.00" +#version of ggml-dsp.c on cDSP side +ggmldsp_version = "0.60" + #0: HEXAGON_BACKEND_QNNCPU #1: HEXAGON_BACKEND_QNNGPU #2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP From 9fc34e7866ac59297800c876cdfc982c2c535013 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 5 Apr 2025 11:43:02 +0800 Subject: [PATCH 167/200] ggml-hexagon: merge build logic in kernels/Makefile to ggml-hexagon/CMakeLists.txt and remove Makefile --- ggml/src/ggml-hexagon/CMakeLists.txt | 24 +++++++++++++++--- ggml/src/ggml-hexagon/kernels/Makefile | 25 ------------------- ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 2 ++ ggml/src/ggml-hexagon/kernels/ggml-dsp.h | 2 +- .../src/ggml-hexagon/kernels/ggmlop_ap_skel.c | 2 -- 5 files changed, 23 insertions(+), 32 deletions(-) delete mode 100755 ggml/src/ggml-hexagon/kernels/Makefile diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 1814cdf4bb194..d4cf76cad3c3f 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -17,6 +17,22 @@ message("QNN_SDK_PATH : ${QNN_SDK_PATH}") message("HEXAGON_SDK_PATH: ${HEXAGON_SDK_PATH}") message("HTP_ARCH_VERSION: ${HTP_ARCH_VERSION}") +#v68 --- Snapdragon 888 +#v69 --- Snapdragon 8 Gen1 +#v73 --- Snapdragon 8 Gen2 +#v75 --- Snapdragon 8 Gen3 +#v79 --- Snapdragon 8 Elite(aka Gen4) +if(NOT DEFINED HTP_ARCH_VERSION) + #set default HTP_ARCH_VERSION to v75 + set(HTP_ARCH_VERSION v75) +endif() + +#cross compiling for hexagon kernels on cDSP side +set(HEXAGON_CC "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang") +set(HEXAGON_CXX "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang") +set(HEXAGON_TARGET libggmlop_skel${HTP_ARCH_VERSION}.so) +set(HEXAGON_KERNELS_PATH "${CMAKE_CURRENT_LIST_DIR}/kernels") + if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) @@ -67,10 +83,10 @@ function(ggml_hexagon_build_kernel KNAME) TARGET ${PROJECT_NAME} POST_BUILD COMMAND echo "current working path:`pwd`\n" - COMMAND echo "${CMAKE_CURRENT_LIST_DIR}/kernels" - COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ clean - COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ HEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} HTP_ARCH_VERSION=${HTP_ARCH_VERSION} - COMMAND echo "`pwd`" + COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -c ${HEXAGON_KERNELS_PATH}/ggml-dsp.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc + COMMAND ${HEXAGON_CC} -o 
${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o -c ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc + COMMAND ${HEXAGON_CC} -m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${HEXAGON_TARGET} -o ${HEXAGON_KERNELS_PATH}/${HEXAGON_TARGET} -Wl,--start-group ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -Wl,--end-group + COMMAND ls -l ${HEXAGON_KERNELS_PATH}/${HEXAGON_TARGET} COMMENT "build hexagon-kernel" ) endfunction() diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile deleted file mode 100755 index 1a39011a02664..0000000000000 --- a/ggml/src/ggml-hexagon/kernels/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -TARGET=libggmlop_skel${HTP_ARCH_VERSION}.so - -$(info HEXAGON_SDK_PATH:${HEXAGON_SDK_PATH}) -$(info HTP_ARCH_VERSION:${HTP_ARCH_VERSION}) - -INCS=-I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc - -CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B ${INCS} - -LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET} - -SRCS = ggml-dsp.c ggmlop_cdsp_skel.c -OBJS = $(patsubst %.c, %.o, $(SRCS)) - -ALL:$(OBJS) - ${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang ${LDFLAGS} -o ${TARGET} -Wl,--start-group ${OBJS} -Wl,--end-group - @ls -l ${TARGET} - -%.o:%.c - @echo "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang ${CFLAGS} -D__FILENAME__=\"$<\" -o $@ -c $< " - ${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang ${CFLAGS} -D__FILENAME__=\"$<\" -o $@ -c $< - @echo "\n" - -clean: - rm -f *.o diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index 43b7017f5abf4..38f80b398bfb6 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -299,7 +299,9 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { // section-2: ggml-hexagon kernel's internal troubleshooting function // ================================================================================================= static void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) 
{ +#if !GGMLHEXAGON_DEBUG return; +#endif static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; va_list args; va_start(args, format); diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h index e7c5633cc93a9..018c8cbbf262b 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -50,7 +50,7 @@ extern "C" { #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#if 0//def NDEBUG +#ifdef NDEBUG #define GGMLHEXAGON_DEBUG 0 #else #define GGMLHEXAGON_DEBUG 1 diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c index 3efbabdb0f67d..025735aad8cbf 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c @@ -405,11 +405,9 @@ static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintp _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30]))); _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30]))); if(_numIn[0]>=255){ - _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of input buffers\n"); return AEE_EUNSUPPORTED; } if(_numROut[0]>=255){ - _QAIC_FARF(RUNTIME_ERROR, "ERROR: Unsupported number of output buffers\n"); return AEE_EUNSUPPORTED; } _allocator_init(_al, 0, 0); From 0978e280d035b1e454e9a3877857b5cd2912d847 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 5 Apr 2025 18:55:44 +0800 Subject: [PATCH 168/200] ggml-hexagon: fix a typo in ggml-hexagon.cpp --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 20 +++++++++----------- ggml/src/ggml-hexagon/kernels/ggml-dsp.h | 3 ++- scripts/build-run-android.sh | 1 + 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c0835ab6e9a9f..71e79e00fc666 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -5086,13 +5086,15 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const case GGML_OP_MUL_MAT: { ggmlhexagon_dump_op_info(op_tensor); - if (1 == g_hexagon_appcfg.enable_q_mulmat) + if (1 == g_hexagon_appcfg.enable_q_mulmat) { return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K ) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); - else - return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && (op_tensor->type == GGML_TYPE_F32); + } else { + return (src0->type == GGML_TYPE_F32) && (src1->type == GGML_TYPE_F32) && + (op_tensor->type == GGML_TYPE_F32); + } } case GGML_OP_SOFT_MAX:{ if 
(!ggml_is_contiguous(op_tensor)) @@ -5124,13 +5126,9 @@ static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const struct ggml_tensor * src0 = op_tensor->src[0]; struct ggml_tensor * src1 = op_tensor->src[1]; - int64_t ne00 = 0; - uint32_t src0_rank = 0; - uint32_t src1_rank = 0; - if (nullptr != src0) { - src0_rank = ggml_n_dims(src0); - ne00 = src0->ne[0]; - } + const int64_t ne00 = src0->ne[0];; + const int src0_rank = ggml_n_dims(src0); + int src1_rank = 0; if (nullptr != src1) { src1_rank = ggml_n_dims(src1); } @@ -6023,7 +6021,7 @@ const char * ggml_backend_hexagon_get_devname(size_t dev_num) { case HEXAGON_BACKEND_QNNNPU: return "HEXAGON_BACKEND_QNN_NPU"; case HEXAGON_BACKEND_GGML: - return "ggml"; //"fake" QNN backend, used for compare performance between hexagon backend and the default ggml backend + return "ggml"; //"fake" hexagon backend, used for compare performance between hexagon backend and the default ggml backend default: return "unknown"; } diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h index 018c8cbbf262b..1953f9ceaa539 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -50,7 +50,8 @@ extern "C" { #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#ifdef NDEBUG +//NPU performance will be slower when enable GGMLHEXAGON_DEBUG +#if 1//def NDEBUG #define GGMLHEXAGON_DEBUG 0 #else #define GGMLHEXAGON_DEBUG 1 diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index fe0b84b223f1c..2251f60a52de5 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -18,6 +18,7 @@ QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/ QNN_SDK_VERSION=2.32.0.250228 QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION} +#5.5.3.0 should be also ok because someone told me can't find 6.2.0.1 on 04/05/2025 HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 #available htp arch version: #v68 --- Snapdragon 888 From 0229a594e8fed8dfeec72943debc46055f638206 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sun, 6 Apr 2025 14:49:22 +0800 Subject: [PATCH 169/200] ggml-hexagon: uniform NDEBUG usage in ggml-hexagon.cpp and ggml-dsp.c --- ggml/src/ggml-hexagon/CMakeLists.txt | 16 +++++++-- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 44 ++++++++++++++++++++---- ggml/src/ggml-hexagon/kernels/ggml-dsp.h | 2 +- scripts/build-run-android.sh | 31 +++++++++++++++-- scripts/ggml-hexagon.cfg | 5 ++- 5 files changed, 84 insertions(+), 14 deletions(-) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index d4cf76cad3c3f..361b444932801 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -17,6 +17,15 @@ message("QNN_SDK_PATH : ${QNN_SDK_PATH}") message("HEXAGON_SDK_PATH: ${HEXAGON_SDK_PATH}") message("HTP_ARCH_VERSION: ${HTP_ARCH_VERSION}") +if (CMAKE_BUILD_TYPE STREQUAL "Debug") + set(DEBUG_FLAG "-Wall") + message("Debug mode:${DEBUG_FLAG}") +else() + set(DEBUG_FLAG "-DNDEBUG -Wall") + message("Release mode:${DEBUG_FLAG}") +endif() + + #v68 --- Snapdragon 888 #v69 --- Snapdragon 8 Gen1 #v73 --- Snapdragon 8 Gen2 @@ -32,6 +41,7 @@ set(HEXAGON_CC "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tool set(HEXAGON_CXX "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang") set(HEXAGON_TARGET libggmlop_skel${HTP_ARCH_VERSION}.so) set(HEXAGON_KERNELS_PATH "${CMAKE_CURRENT_LIST_DIR}/kernels") +set(HEXAGON_COMPUTE 
"compute${HTP_ARCH_VERSION}") if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) @@ -64,7 +74,7 @@ else() message(FATAL_ERROR "QNN now only available on Android and Windows(Windows on ARM)") endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_QNN") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/ggmlop_ap_skel.c") @@ -83,8 +93,8 @@ function(ggml_hexagon_build_kernel KNAME) TARGET ${PROJECT_NAME} POST_BUILD COMMAND echo "current working path:`pwd`\n" - COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -c ${HEXAGON_KERNELS_PATH}/ggml-dsp.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc - COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o -c ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc + COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -c ${HEXAGON_KERNELS_PATH}/ggml-dsp.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic ${DEBUG_FLAG} -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/posix -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/qurt/ + COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o -c ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc COMMAND ${HEXAGON_CC} -m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${HEXAGON_TARGET} -o 
${HEXAGON_KERNELS_PATH}/${HEXAGON_TARGET} -Wl,--start-group ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -Wl,--end-group COMMAND ls -l ${HEXAGON_KERNELS_PATH}/${HEXAGON_TARGET} COMMENT "build hexagon-kernel" diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 71e79e00fc666..b06be91dcc5d2 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -129,7 +129,7 @@ class qnn_instance; struct ggml_backend_hexagon_context; -#if 0//def NDEBUG +#ifdef NDEBUG #define GGMLHEXAGON_DEBUG 0 #else #define GGMLHEXAGON_DEBUG 1 @@ -141,6 +141,7 @@ struct ggml_backend_hexagon_context; #define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#define GGMLHEXAGON_LOG_VERBOSE(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #if GGMLHEXAGON_DEBUG #define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -154,6 +155,10 @@ struct ggml_backend_hexagon_context; #define SIZE_IN_MB (1 << 20) #define STATUS_CONTEXT 0x12345678 +#if !defined (_WINDOWS) +#pragma weak remote_system_request +#endif + #define CHECK_QNN_API(error, result) \ do { \ error = (result); \ @@ -316,9 +321,11 @@ struct hexagon_appcfg_t { int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool int enable_rpc_dma_mempool; // enable/disable rpc dma memory pool + int enable_all_q_mulmat; // enable/disable offload all quantized type mulmat to cDSP const char * cfgfilename; const char * runtime_libpath; char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN]; + char ggml_dsp_version[GGMLHEXAGON_TMPBUF_LEN]; }; static struct hexagon_appcfg_t g_hexagon_appcfg = { @@ -335,6 +342,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .hexagon_backend = HEXAGON_BACKEND_CDSP, .enable_rpc_ion_mempool = 0, .enable_rpc_dma_mempool = 0, + .enable_all_q_mulmat = 0, .cfgfilename = "ggml-hexagon.cfg", #if defined(__ANDROID__) //Android command line program @@ -344,7 +352,8 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { #elif defined(_WIN32) .qnn_runtimelib_path = "C:\\", #endif - .ggml_hexagon_version = {"1.00"}, + .ggml_hexagon_version = {"1.01"}, + .ggml_dsp_version = {"0.60"}, }; //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices @@ -857,6 +866,7 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); GGMLHEXAGON_LOG_INFO("ggml_hexagon_version: %s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_INFO("ggml_dsp_version: %s", g_hexagon_appcfg.ggml_dsp_version); GGMLHEXAGON_LOG_INFO("hwaccel approach: %d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); GGMLHEXAGON_LOG_INFO("hexagon_backend: %d(%s)", g_hexagon_appcfg.hexagon_backend, @@ -891,7 +901,7 @@ class hexagon_perf { return; _end_time = ggml_time_us(); _duration = (_end_time - _begin_time); - 
GGMLHEXAGON_LOG_DEBUG("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + GGMLHEXAGON_LOG_VERBOSE("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); } private: @@ -1454,7 +1464,9 @@ static void ggmlhexagon_load_cfg() { qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); qnncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 1); qnncfg_instance.get_intvalue("cdsp", "enable_rpc_dma_mempool", g_hexagon_appcfg.enable_rpc_dma_mempool, 0); + qnncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0); GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version); GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", ggml_hexagon_version.c_str()); GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); @@ -1504,6 +1516,13 @@ static bool ggmlhexagon_check_valid_appcfg() { GGMLHEXAGON_LOG_INFO("rpc dma mempool not supported"); is_valid_appcfg = false; } + + if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { + if (0 == g_hexagon_appcfg.enable_q_mulmat) { + GGMLHEXAGON_LOG_INFO("ensure set enable_q_mulmat to 1 firstly when set enable_all_q_mulmat to 1"); + is_valid_appcfg = false; + } + } } if (!is_valid_appcfg) { @@ -2743,6 +2762,10 @@ static void ggmlqnn_sdk_logcallback(const char * fmt, vsnprintf(reinterpret_cast(s_ggmlqnn_sdk_logbuf), GGMLHEXAGON_LOGBUF_LEN, fmt, argp); GGMLHEXAGON_LOG_DEBUG("%8.1fms [%-7s] %s\n", ms, log_level_desc, s_ggmlqnn_sdk_logbuf); } +#if !GGMLHEXAGON_DEBUG + GGML_UNUSED(log_level_desc); + GGML_UNUSED(ms); +#endif } int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { @@ -5075,6 +5098,7 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const const struct ggml_tensor * src0 = op_tensor->src[0]; const struct ggml_tensor * src1 = op_tensor->src[1]; + const int src0_rank = ggml_n_dims(src0); switch (op_tensor->op) { case GGML_OP_ADD: { @@ -5086,7 +5110,15 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const case GGML_OP_MUL_MAT: { ggmlhexagon_dump_op_info(op_tensor); + //FIXME:remove this filter in the future + if (2 != src0_rank) { + return false; + } if (1 == g_hexagon_appcfg.enable_q_mulmat) { + if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { + return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32); + } + return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_Q6_K || src0->type == GGML_TYPE_Q8_K @@ -5126,9 +5158,9 @@ static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const struct ggml_tensor * src0 = op_tensor->src[0]; struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = src0->ne[0];; - const int src0_rank = ggml_n_dims(src0); - int src1_rank = 0; + const int64_t ne00 = src0->ne[0];; + const int src0_rank = ggml_n_dims(src0); + int src1_rank = 0; if (nullptr != src1) { src1_rank = ggml_n_dims(src1); } diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h index 1953f9ceaa539..c77e45391205e 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -51,7 +51,7 @@ extern 
"C" { #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) //NPU performance will be slower when enable GGMLHEXAGON_DEBUG -#if 1//def NDEBUG +#ifdef NDEBUG #define GGMLHEXAGON_DEBUG 0 #else #define GGMLHEXAGON_DEBUG 1 diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 2251f60a52de5..3d9ec2901756b 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -18,7 +18,7 @@ QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/ QNN_SDK_VERSION=2.32.0.250228 QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION} -#5.5.3.0 should be also ok because someone told me can't find 6.2.0.1 on 04/05/2025 +#5.5.3.0 should be also ok HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 #available htp arch version: #v68 --- Snapdragon 888 @@ -132,6 +132,16 @@ function build_arm64 cd - } +function build_arm64_debug +{ + cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Debug -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_HEXAGON=ON -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} + cd out/android + make -j16 + show_pwd + + cd - +} + function remove_temp_dir() { @@ -177,7 +187,7 @@ function update_qnn_cfg() } -function build_ggml_qnn() +function build_ggml_hexagon() { show_pwd check_and_download_ndk @@ -188,6 +198,17 @@ function build_ggml_qnn() build_arm64 } +function build_ggml_hexagon_debug() +{ + show_pwd + check_and_download_ndk + check_and_download_qnn_sdk + check_hexagon_sdk + dump_vars + remove_temp_dir + build_arm64_debug +} + function prepare_run_on_phone() { @@ -341,6 +362,7 @@ function show_usage() echo " $0 help" echo " $0 print_oplist" echo " $0 build" + echo " $0 build_debug (enable debug log for developers on ARM-AP side and cDSP side)" echo " $0 updateqnnlib" echo " $0 run_testops" echo " $0 run_testop [ADD/MUL_MAT]" @@ -371,7 +393,10 @@ elif [ $# == 1 ]; then print_oplist exit 1 elif [ "$1" == "build" ]; then - build_ggml_qnn + build_ggml_hexagon + exit 0 + elif [ "$1" == "build_debug" ]; then + build_ggml_hexagon_debug exit 0 elif [ "$1" == "run_testops" ]; then run_test-ops diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index f76fd9a749fbf..462ab09fbdc81 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -1,6 +1,6 @@ [general] #version of ggml-hexagon.cpp on ARM-AP side -version = "1.00" +version = "1.01" #version of ggml-dsp.c on cDSP side ggmldsp_version = "0.60" @@ -44,3 +44,6 @@ precision_mode = "fp16" enable_rpc_ion_mempool = 0 #enable/disable rpc dma memory pool enable_rpc_dma_mempool = 0 +#enable/disable offload all quantized type mulmat to cDSP +#ensure enable_q_mulmat already be setting to 1 +enable_all_q_mulmat = 0 From e5da565a59893fcd71991192ae4325a3ae2cf390 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Mon, 7 Apr 2025 21:45:30 +0800 Subject: [PATCH 170/200] ggml-hexagon: add profiler feature for purpose of visualize NPU performance and release ggml-hexagon.cpp v1.80 --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 560 +++++++++++++++++++---- ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 11 +- scripts/build-run-android.sh | 4 +- scripts/ggml-hexagon.cfg | 94 +++- 4 files changed, 549 insertions(+), 120 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index b06be91dcc5d2..208024258f50b 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ 
b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -127,6 +127,7 @@ // section-1: forward/prototype declaration, global vars, macros, data structures // ================================================================================================= class qnn_instance; +class hexagon_profiler; struct ggml_backend_hexagon_context; #ifdef NDEBUG @@ -309,7 +310,8 @@ struct hexagon_op_caps { struct hexagon_appcfg_t { int print_qnn_internal_log; // enable/disable QNN's internal log - int enable_perf; // enable/disable perf of op function + int enable_perf; // enable/disable perf of a specified ggml op + int enable_profiler; // enable/disable profiler feature to visualization comparison between HWACCEL_CDSP and HWACCEL_QNN int print_tensors_info; // enable/disable print tensors info in op function int dump_op_info; // enable/disable dump op info in handle_op int enable_q_mulmat; // enable/disable offload quantized mulmat @@ -322,6 +324,8 @@ struct hexagon_appcfg_t { int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool int enable_rpc_dma_mempool; // enable/disable rpc dma memory pool int enable_all_q_mulmat; // enable/disable offload all quantized type mulmat to cDSP + int profiler_duration; // threshold of duration in profiler, per seconds + int profiler_counts; // threshold of counts in profiler const char * cfgfilename; const char * runtime_libpath; char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN]; @@ -330,7 +334,8 @@ struct hexagon_appcfg_t { static struct hexagon_appcfg_t g_hexagon_appcfg = { .print_qnn_internal_log = 0, - .enable_perf = 0, + .enable_perf = 1, + .enable_profiler = 0, .print_tensors_info = 0, .dump_op_info = 0, .enable_q_mulmat = 0, @@ -343,6 +348,8 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .enable_rpc_ion_mempool = 0, .enable_rpc_dma_mempool = 0, .enable_all_q_mulmat = 0, + .profiler_duration = 5, + .profiler_counts = 100, .cfgfilename = "ggml-hexagon.cfg", #if defined(__ANDROID__) //Android command line program @@ -352,7 +359,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { #elif defined(_WIN32) .qnn_runtimelib_path = "C:\\", #endif - .ggml_hexagon_version = {"1.01"}, + .ggml_hexagon_version = {"1.80"}, .ggml_dsp_version = {"0.60"}, }; @@ -714,8 +721,39 @@ static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique static int32_t g_qnnopcfg_idx = 0; //ensure every QNN opconfig name is unique // ================================================================================================= -// section-2: ggml-hexagon internal troubleshooting function/class +// section-2: ggml-hexagon internal troubleshooting and profiler function/class // ================================================================================================= +static const char * ggmlhexagon_get_hwaccel_approach_name(int hwaccle_approach) { + switch (hwaccle_approach) { + case HWACCEL_QNN: + return "HWACCEL_QNN"; + case HWACCEL_QNN_SINGLEGRAPH: + return "HWACCEL_QNN_SINGLEGRAPH"; + case HWACCEL_CDSP: + return "HWACCEL_CDSP"; + default: + return "unknown hwaccel approach"; + } +} + +static void ggmlhexagon_get_timestring(char * p_currenttime) { +#if defined(__ANDROID__) || defined(__linux__) + time_t n_seconds = 0; + struct tm now_time; + + if (nullptr == p_currenttime) + return; + + time(&n_seconds); + localtime_r(&n_seconds, &now_time); + snprintf(p_currenttime, GGMLHEXAGON_TMPBUF_LEN, "%04d-%02d-%02d,%02d:%02d:%02d", + now_time.tm_year + 1900, now_time.tm_mon + 1, now_time.tm_mday, + now_time.tm_hour, now_time.tm_min, now_time.tm_sec); +#else + 
//TODO: WoA +#endif +} + static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...) { static std::mutex ggmlhexagon_log_internal_mutex; static char s_ggmlhexagon_log_internal_buf[GGMLHEXAGON_LOGBUF_LEN]; @@ -829,66 +867,313 @@ static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, const char * nam GGMLHEXAGON_LOG_DEBUG("\n"); } -static const char * ggmlhexagon_get_hwaccel_approach_name(int hwaccle_approach) { - switch (hwaccle_approach) { - case HWACCEL_QNN: - return "HWACCEL_QNN"; - case HWACCEL_QNN_SINGLEGRAPH: - return "HWACCEL_QNN_SINGLEGRAPH"; - case HWACCEL_CDSP: - return "HWACCEL_CDSP"; - default: - return "unknown hwaccel approach"; +//a simple high-cohesion and low-coupling class to collect necessary profiler data and visualize NPU performance accordingly +class hexagon_profiler { +public: + static hexagon_profiler & get_instance() { + //make thread-safety without using complex dynamic resource management + static hexagon_profiler instance; + return instance; } -} -static void ggmlhexagon_get_timestring(char * p_currenttime) { -#if defined(__ANDROID__) || defined(__linux__) - time_t n_seconds = 0; - struct tm now_time; +public: + void profiler_init(int profiler_threshold_duration, int profiler_threshold_counts) { + reset(); + //here is not accurate profiler start time because inference wasn't launched at the moment + _profiler_starttime = ggml_time_us(); - if (nullptr == p_currenttime) - return; + _profiler_threshold_duration = profiler_threshold_duration; + _profiler_threshold_counts = profiler_threshold_counts; - time(&n_seconds); - localtime_r(&n_seconds, &now_time); - snprintf(p_currenttime, GGMLHEXAGON_TMPBUF_LEN, "%04d-%02d-%02d,%02d:%02d:%02d", - now_time.tm_year + 1900, now_time.tm_mon + 1, now_time.tm_mday, - now_time.tm_hour, now_time.tm_min, now_time.tm_sec); -#else - //TODO: WoA -#endif -} + //FIXME:hardcode filename of profiler data + std::string filename = std::string(g_hexagon_appcfg.runtime_libpath) + "/"; + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) { + filename = filename + "hexagon_perf_cdsp.dat"; + } else { + filename = filename + "hexagon_perf_cdsp_ion.dat"; + } + } else { + filename = filename + "hexagon_perf_qnn.dat"; + } + GGMLHEXAGON_LOG_DEBUG("profiler name:%s", filename.c_str()); + const char * profiler_filename = filename.c_str(); + _fp_profile_file = fopen(profiler_filename, "w"); + if (nullptr == _fp_profile_file) { + GGMLHEXAGON_LOG_WARN("can't open profiler file %s, reason:%s", profiler_filename, strerror(errno)); + reset(); + return; + } else { + size_t written_size = 0; + char profiler_info[GGMLHEXAGON_TMPBUF_LEN]; + const char * prefix = "### starting hexagon profiler at "; + + written_size = fwrite(prefix, 1, strlen(prefix), _fp_profile_file); + if (written_size != strlen(prefix)) { + GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno)); + reset(); + return; + } -static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx); -static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * ctx) { - char timestamp[GGMLHEXAGON_TMPBUF_LEN]; - memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); + memset(profiler_info, 0, GGMLHEXAGON_TMPBUF_LEN); + ggmlhexagon_get_timestring(profiler_info); + written_size = fwrite(profiler_info, 1, strlen(profiler_info), _fp_profile_file); + if (written_size != strlen(profiler_info)) { + 
GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno)); + reset(); + return; + } + fprintf(_fp_profile_file, "\n\n"); + fprintf(_fp_profile_file, + "#frame input max total avg elapse frame max total avg\n"); + fprintf(_fp_profile_file, + "# inference inference inference inference\n"); + fprintf(_fp_profile_file, + "#index len i-len i-len i-speed time time time time time\n"); + fprintf(_fp_profile_file, "\n\n"); + } + _enable_profiler = true; + } - GGMLHEXAGON_LOG_INFO("ggml_hexagon_version: %s", g_hexagon_appcfg.ggml_hexagon_version); - GGMLHEXAGON_LOG_INFO("ggml_dsp_version: %s", g_hexagon_appcfg.ggml_dsp_version); - GGMLHEXAGON_LOG_INFO("hwaccel approach: %d(%s)", g_hexagon_appcfg.hwaccel_approach, - ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); - GGMLHEXAGON_LOG_INFO("hexagon_backend: %d(%s)", g_hexagon_appcfg.hexagon_backend, - ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); - ggmlhexagon_get_timestring(timestamp); - if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); - GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); - GGMLHEXAGON_LOG_INFO("using rpc dma memory pool: %s", g_hexagon_appcfg.enable_rpc_dma_mempool ? "YES" : "NO"); - ggmlhexagon_probe_dspinfo(ctx); - } else { - GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); + void profiler_deinit() { + if (nullptr != _fp_profile_file) { + fclose(_fp_profile_file); + _fp_profile_file = nullptr; + } + reset(); } - GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); -} +/** + * \param inference_time microseconds, inference time for a single GGML op + * \param inference_input_size bytes, total input data size for a single GGML op + * \param inference_output_size bytes, total output data size for a single GGML op + */ + void profiler_update_profilerdata(const char * ggml_opname, int inference_time, int inference_input_size, int inference_output_size) { + if (!_enable_profiler) + return; + + //1.get the accurate profiler starting time in this function when frame index is 0 + //2.update frame index in this function accordingly + profiler_update_frameindex(); + + int64_t elapse_time = ggml_time_us() - profiler_get_starttime(); + profiler_update_elapsetime(elapse_time); + if (elapse_time > (_profiler_threshold_duration * SIZE_IN_MB)) { + //do nothing when elapsed profiler time > profiler_duration in ggml-hexagon.cfg + return; + } + if (profiler_get_frame_index() >= _profiler_threshold_counts) { + //do nothing when frame_index >= profiler_counts in ggml-hexagon.cfg + return; + } + + if (inference_input_size > profiler_get_max_inputsize()) { + profiler_set_max_inputsize(inference_input_size); + } + + if (inference_output_size > profiler_get_max_outputsize()) { + profiler_set_max_outputsize(inference_output_size); + } + + if (inference_time > profiler_get_max_inferencetime()) { + profiler_set_max_inferencetime(inference_time); + } + + profiler_update_total_inputsize(inference_input_size); + profiler_update_total_outputsize(inference_output_size); + profiler_update_total_inferencetime(inference_time); + profiler_update_elapsetime(elapse_time); + + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + if (10 > _frame_index) { + //FIXME:why some initial profiler data in llama-cli looks unusual + //return; + } 
+ } + + if (0 == elapse_time) { + //filter invalid profiler data + GGMLHEXAGON_LOG_DEBUG("_enable_profiler %d", _enable_profiler); + return; + } + + if (NULL != _fp_profile_file) { + fprintf(_fp_profile_file, "%-8d %-6d %-6d %-10ld %-11ld %-10ld %-12d %-9d %-11ld %-3ld\n", + profiler_get_frame_index(), + inference_input_size, + profiler_get_max_inputsize(), + profiler_get_total_inputputsize(), + profiler_get_total_inputputsize() / profiler_get_frame_index(), + + elapse_time, + inference_time, + profiler_get_max_inferencetime(), + profiler_get_total_inferencetime(), + profiler_get_total_inferencetime() / profiler_get_frame_index() + ); + } + + //print/compare NPU's I/O performance between 8Gen3 and 8Elite(aka 8Gen4) , removed in the future + char bps_string[GGMLHEXAGON_TMPBUF_LEN]; + memset(bps_string, 0, GGMLHEXAGON_TMPBUF_LEN); + profiler_get_bpsstring(_total_inputsize + _total_outputsize, elapse_time, bps_string); + GGMLHEXAGON_LOG_VERBOSE("I/O performance:%s", bps_string); + } + + int profiler_get_frame_index() { + return _frame_index; + } + + int profiler_get_threshold_count() { + return _profiler_threshold_counts; + } + +private: + void profiler_set_max_inputsize(int input_size) { + _max_inputsize = input_size; + } + + void profiler_set_max_outputsize(int output_size) { + _max_outputsize = output_size; + } + + void profiler_set_max_inferencetime(int inference_time) { + _max_inferencetime = inference_time; + } + + void profiler_update_frameindex() { + if (0 == _frame_index) { + _profiler_starttime = ggml_time_us(); + } + _frame_index += 1; + } + + void profiler_update_elapsetime(int64_t elapse_time_microseconds) { + _profiler_elapsetime = elapse_time_microseconds; + } + + void profiler_update_total_inferencetime(int inference_time) { + _total_inferencetime += inference_time; + } + + void profiler_update_total_inputsize(int input_size) { + _total_inputsize += input_size; + } + + void profiler_update_total_outputsize(int output_size) { + _total_outputsize += output_size; + } + + int profiler_get_max_inputsize() { + return _max_inputsize; + } + + int profiler_get_max_outputsize() { + return _max_outputsize; + } + + int profiler_get_max_inferencetime() { + return _max_inferencetime; + } + + int64_t profiler_get_total_inferencetime() { + return _total_inferencetime; + } + + int64_t profiler_get_total_inputputsize() { + return _total_inputsize; + } + + //might-be used to calculate total I/O performance in the future + int64_t profiler_get_total_outputsize() { + return _total_outputsize; + } + + int64_t profiler_get_starttime() { + return _profiler_starttime; + } + + int64_t profiler_get_elapsedtime() { + return _profiler_elapsetime; + } + + void profiler_get_bpsstring(int64_t data_size, int64_t elapse_time_microseconds, char * bps_string) { + if (nullptr == bps_string) { + return; + } + + float bps = 0.0f; + bps = (data_size * SIZE_IN_MB * 1.0f) / (elapse_time_microseconds * 1.0f); + if (bps >= SIZE_IN_MB) { + snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.2f MiB/s", ((float) bps) / SIZE_IN_MB); + } else if (bps >= 1000) { + snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.1f KiB/s", ((float) bps) / 1000); + } else { + snprintf(bps_string, GGMLHEXAGON_TMPBUF_LEN, "%.2f B/s", bps); + } + } + + void reset() { + _frame_index = 0; + + _max_inputsize = 0; + _max_outputsize = 0; + _max_inferencetime = 0; + + _total_inputsize = 0; + _total_outputsize = 0; + _total_inferencetime = 0; + + _profiler_starttime = 0; + _profiler_elapsetime = 0; + _fp_profile_file = nullptr; + _enable_profiler = false; + 
_profiler_threshold_duration = 100; + _profiler_threshold_duration = 5; + } + +private: + hexagon_profiler() { + reset(); + } + + hexagon_profiler(const hexagon_profiler &) = delete; + + hexagon_profiler(const hexagon_profiler &&) = delete; + + hexagon_profiler & operator= (const hexagon_profiler &) = delete; + +private: + int _frame_index; + + int _max_inputsize; //bytes + int _max_outputsize; //bytes + int _max_inferencetime; //bytes + + int64_t _total_inputsize; //bytes + int64_t _total_outputsize; //bytes + int64_t _total_inferencetime; //microsecond + + int64_t _profiler_starttime; //microsecond + int64_t _profiler_elapsetime; //microsecond + FILE * _fp_profile_file; + + bool _enable_profiler; + int _profiler_threshold_duration; //seconds + int _profiler_threshold_counts; +}; +static hexagon_profiler & g_hexagon_profiler = hexagon_profiler::get_instance(); + +//a simple perf class to probe NPU performance class hexagon_perf { public: hexagon_perf(const std::string & perf_name) : _perf_name(std::move(perf_name)) {} - hexagon_perf() = delete; - hexagon_perf(const hexagon_perf & ) = delete; - hexagon_perf & operator= (const hexagon_perf & ) = delete; + hexagon_perf(const std::string & perf_name, const char * op_name, int input_size, int output_size) + : _perf_name(std::move(perf_name)), _op_name(op_name), + _input_size(input_size), + _output_size(output_size) { + + } void start() { if (0 == g_hexagon_appcfg.enable_perf) @@ -897,22 +1182,46 @@ class hexagon_perf { } void info() { - if (0 == g_hexagon_appcfg.enable_perf) + if (0 == g_hexagon_appcfg.enable_perf) { return; + } + _end_time = ggml_time_us(); _duration = (_end_time - _begin_time); - GGMLHEXAGON_LOG_VERBOSE("duration of %s : %lld microseconds\n", _perf_name.c_str(), _duration); + //I think add following judgement will useful for other developers and AI experts although: + // it breaks the original logic + // it's not mandatory + // I had to expose two public function in hexagon_profiler class + if (g_hexagon_profiler.profiler_get_frame_index() <= g_hexagon_profiler.profiler_get_threshold_count()) { + GGMLHEXAGON_LOG_VERBOSE("inference duration of %s through %s: %lld microseconds", + _perf_name.c_str(), ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach), _duration); + } + + //update profiler data + g_hexagon_profiler.profiler_update_profilerdata(_op_name, _duration, _input_size, _output_size); } +private: + hexagon_perf() = delete; + hexagon_perf(const hexagon_perf & ) = delete; + hexagon_perf(const hexagon_perf && ) = delete; + hexagon_perf & operator= (const hexagon_perf & ) = delete; + private: int64_t _begin_time = 0LL; int64_t _end_time = 0LL; int64_t _duration = 0LL; std::string _perf_name; + const char * _op_name; + int _input_size = 0; + int _output_size = 0; }; +//a simple class to load configurations from ggml-hexagon.cfg class hexagon_appcfg { public: + hexagon_appcfg() {} + void dump(std::function worker) { if (!_load_success) { GGMLHEXAGON_LOG_INFO("qnn cfg file %s not loaded", _cfg_filename.c_str()); @@ -1045,6 +1354,12 @@ class hexagon_appcfg { trim(value); return true; } + +private: + hexagon_appcfg(const hexagon_appcfg & ) = delete; + hexagon_appcfg(const hexagon_appcfg && ) = delete; + hexagon_appcfg & operator= (const hexagon_appcfg & ) = delete; + private: std::unordered_map> _hexagon_appcfg; bool _load_success = false; @@ -1441,30 +1756,36 @@ static void ggmlhexagon_load_cfg() { GGMLHEXAGON_LOG_DEBUG("program running start time:%s", time_string); std::string cfg_filename = 
std::string(g_hexagon_appcfg.runtime_libpath) + std::string(g_hexagon_appcfg.cfgfilename); GGMLHEXAGON_LOG_INFO("load hexagon appcfg from %s", cfg_filename.c_str()); - hexagon_appcfg qnncfg_instance; - qnncfg_instance.load(cfg_filename); - qnncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { + hexagon_appcfg hexagoncfg_instance; + hexagoncfg_instance.load(cfg_filename); + hexagoncfg_instance.dump([](const std::string & section, const std::string & key, const std::string value) { std::ostringstream tmposs; tmposs << "section[" << std::setw(10) << std::left << section << "],[" << std::setw(25) << std::left << key << "] = [" << value << "]"; GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str()); }); std::string precision_mode; std::string ggml_hexagon_version; - qnncfg_instance.get_stringvalue("general", "ggml_hexagon_version", ggml_hexagon_version, "1.00"); - qnncfg_instance.get_intvalue("general", "print_qnn_internal_log", g_hexagon_appcfg.print_qnn_internal_log, 0); - qnncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1); - qnncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0); - qnncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0); - qnncfg_instance.get_intvalue("general", "hwaccel_approach", g_hexagon_appcfg.hwaccel_approach, HWACCEL_CDSP); - qnncfg_instance.get_intvalue("general", "hexagon_backend", g_hexagon_appcfg.hexagon_backend, HEXAGON_BACKEND_CDSP); - qnncfg_instance.get_intvalue("general", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0); - qnncfg_instance.get_intvalue("qnn", "hvx_threads", g_hexagon_appcfg.hvx_threads, 4); - qnncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_hexagon_appcfg.vtcm_size_in_mb, 8); - qnncfg_instance.get_intvalue("qnn", "enable_dlbc", g_hexagon_appcfg.enable_dlbc, 1); - qnncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); - qnncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 1); - qnncfg_instance.get_intvalue("cdsp", "enable_rpc_dma_mempool", g_hexagon_appcfg.enable_rpc_dma_mempool, 0); - qnncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0); + hexagoncfg_instance.get_stringvalue("general", "ggml_hexagon_version", ggml_hexagon_version, "1.00"); + hexagoncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1); + hexagoncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0); + hexagoncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0); + hexagoncfg_instance.get_intvalue("general", "hwaccel_approach", g_hexagon_appcfg.hwaccel_approach, HWACCEL_CDSP); + hexagoncfg_instance.get_intvalue("general", "hexagon_backend", g_hexagon_appcfg.hexagon_backend, HEXAGON_BACKEND_CDSP); + hexagoncfg_instance.get_intvalue("general", "enable_q_mulmat", g_hexagon_appcfg.enable_q_mulmat, 0); + hexagoncfg_instance.get_intvalue("general", "enable_profiler", g_hexagon_appcfg.enable_profiler, 0); + hexagoncfg_instance.get_intvalue("general", "profiler_duration", g_hexagon_appcfg.profiler_duration, 5); + hexagoncfg_instance.get_intvalue("general", "profiler_counts", g_hexagon_appcfg.profiler_counts, 100); + + hexagoncfg_instance.get_intvalue("qnn", "hvx_threads", g_hexagon_appcfg.hvx_threads, 4); + hexagoncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", 
g_hexagon_appcfg.vtcm_size_in_mb, 8); + hexagoncfg_instance.get_intvalue("qnn", "enable_dlbc", g_hexagon_appcfg.enable_dlbc, 1); + hexagoncfg_instance.get_stringvalue("qnn", "precision_mode", precision_mode, "fp32"); + hexagoncfg_instance.get_intvalue("qnn", "print_qnn_internal_log", g_hexagon_appcfg.print_qnn_internal_log, 0); + + hexagoncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 0); + hexagoncfg_instance.get_intvalue("cdsp", "enable_rpc_dma_mempool", g_hexagon_appcfg.enable_rpc_dma_mempool, 0); + hexagoncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0); + GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version); GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", ggml_hexagon_version.c_str()); @@ -1473,6 +1794,8 @@ static void ggmlhexagon_load_cfg() { GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); GGMLHEXAGON_LOG_INFO("runtime libpath=%s", g_hexagon_appcfg.runtime_libpath); + GGMLHEXAGON_LOG_INFO("enable_perf=%d", g_hexagon_appcfg.enable_perf); + GGMLHEXAGON_LOG_INFO("enable_profiler=%d", g_hexagon_appcfg.enable_profiler); if (precision_mode.find("fp16") != std::string::npos) { g_hexagon_appcfg.precision_mode = 1; @@ -1482,6 +1805,11 @@ static void ggmlhexagon_load_cfg() { ggmlhexagon_set_runtime_path(HEXAGON_BACKEND_CDSP, g_hexagon_appcfg.runtime_libpath); + if (1 == g_hexagon_appcfg.enable_profiler) { + //make sure this function is called only once + g_hexagon_profiler.profiler_init(g_hexagon_appcfg.profiler_duration, g_hexagon_appcfg.profiler_counts); + } + initialized = true; } @@ -1531,6 +1859,34 @@ static bool ggmlhexagon_check_valid_appcfg() { return is_valid_appcfg; } +static void ggmlhexagon_probe_dspinfo(ggml_backend_hexagon_context * ctx); +static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * ctx) { + char timestamp[GGMLHEXAGON_TMPBUF_LEN]; + memset(timestamp, 0, GGMLHEXAGON_TMPBUF_LEN); + + GGMLHEXAGON_LOG_INFO("ggml_hexagon_version: %s", g_hexagon_appcfg.ggml_hexagon_version); + GGMLHEXAGON_LOG_INFO("ggml_dsp_version: %s", g_hexagon_appcfg.ggml_dsp_version); + GGMLHEXAGON_LOG_INFO("hwaccel approach: %d(%s)", g_hexagon_appcfg.hwaccel_approach, + ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); + GGMLHEXAGON_LOG_INFO("hexagon_backend: %d(%s)", g_hexagon_appcfg.hexagon_backend, + ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); + ggmlhexagon_get_timestring(timestamp); + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); + GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); + GGMLHEXAGON_LOG_INFO("using rpc dma memory pool: %s", g_hexagon_appcfg.enable_rpc_dma_mempool ? "YES" : "NO"); + ggmlhexagon_probe_dspinfo(ctx); + } else { + GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? 
"YES" : "NO"); + } + GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); + + if (1 == g_hexagon_appcfg.enable_profiler) { + //make sure this function is called only once + g_hexagon_profiler.profiler_deinit(); + } +} + // ================================================================================================= // section-5: QNN helper function/class // ================================================================================================= @@ -2350,8 +2706,6 @@ void qnn_instance::free_rpcmem(void * buf) { } if (rpcbuffer_size != 0) { _rpcmem_usage_map.erase(buf); - } else { - GGMLHEXAGON_LOG_WARN("it shouldn't happen, pls check why?"); } _pfn_rpc_mem_free(_rpcmem_store_map[buf]); _rpcmem_store_map.erase(buf); @@ -3568,13 +3922,17 @@ static void ggmlqnn_compute_elementwise(ggml_backend_hexagon_context * ctx, ggml size_t qnn_op_index = ggmlhexagon_get_op_index(op); const char * qnn_op_name = ggmlqnn_k_op_caps[qnn_op_index].qnn_op_name; size_t input_param_count = ggmlqnn_k_op_caps[qnn_op_index].input_param_count; - std::string ggml_op_name_string = std::string("ggml_") + ggml_op_name(op->op); + const char * ggml_original_opname = ggml_op_name(op->op); + std::string ggml_op_name_string = std::string("ggml_") + ggml_original_opname; const char * ggml_op_name = ggml_op_name_string.c_str(); std::string graph_name; ggmlhexagon_get_opkey_from_op(op, graph_name); - hexagon_perf op_perf(graph_name); + int input_size = ggml_nbytes(src0); + if (nullptr != src1) + input_size += ggml_nbytes(src1); + hexagon_perf op_perf(graph_name, ggml_original_opname, input_size, ggml_nbytes(dst)); op_perf.start(); bool enable_npu_rpc = instance->enable_qnn_rpc() && ctx->device == HEXAGON_BACKEND_QNNNPU; @@ -3953,13 +4311,16 @@ static void ggmlqnn_compute_mul_mat(ggml_backend_hexagon_context * ctx, ggml_ten const enum ggml_type src0_type = src0->type; const uint32_t src0_rank = ggml_n_dims(src0); const uint32_t src1_rank = ggml_n_dims(src1); - + const char * ggml_original_opname = ggml_op_name(op->op); ggmlhexagon_print_tensors_info(__func__, ctx, src0, src1, dst); std::string graph_name; ggmlhexagon_get_opkey_from_op(op, graph_name); - hexagon_perf op_perf(graph_name); + int input_size = ggml_nbytes(src0); + if (nullptr != src1) + input_size += ggml_nbytes(src1); + hexagon_perf op_perf(graph_name, ggml_original_opname, input_size, ggml_nbytes(dst)); op_perf.start(); GGML_ASSERT(src0_rank == src1_rank); @@ -5001,11 +5362,9 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ struct dsptensor dsptensor_1; struct dsptensor dsptensor_2; std::string op_name; + const char * ggml_opname = ggml_op_name(op->op); ggmlhexagon_get_opkey_from_op(op, op_name); - hexagon_perf op_perf(op_name); - op_perf.start(); - int hexagon_error = AEE_SUCCESS; ggmlhexagon_op_func_t op_func = nullptr; size_t input_tensor_count = 2; @@ -5014,6 +5373,12 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ ggml_tensor * src1 = op->src[1]; ggml_tensor * dst = op; + int input_size = ggml_nbytes(src0); + if (nullptr != src1) + input_size += ggml_nbytes(src1); + hexagon_perf op_perf(op_name, ggml_opname, input_size, ggml_nbytes(dst)); + op_perf.start(); + input_tensor_count = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].input_param_count; op_func = ggmlhexagon_k_op_caps[ggmlhexagon_get_op_index(op)].dsp_op_func; if (nullptr == op_func) { @@ -5096,9 +5461,13 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const return false; } - const 
struct ggml_tensor * src0 = op_tensor->src[0]; - const struct ggml_tensor * src1 = op_tensor->src[1]; - const int src0_rank = ggml_n_dims(src0); + const ggml_tensor * src0 = op_tensor->src[0]; + const ggml_tensor * src1 = op_tensor->src[1]; + const int src0_rank = ggml_n_dims(src0); + int src1_rank = 0; + if (nullptr != src1) { + src1_rank = ggml_n_dims(src1); + } switch (op_tensor->op) { case GGML_OP_ADD: { @@ -5110,10 +5479,13 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const case GGML_OP_MUL_MAT: { ggmlhexagon_dump_op_info(op_tensor); - //FIXME:remove this filter in the future - if (2 != src0_rank) { + //FIXME:keep same filter logic with QNN solution to compare NPU performance between cDSP approach + // and QNN-NPU approach, remove these filters in the future + if (src0_rank != src1_rank) return false; - } + if (src0_rank != 2) + return false; + if (1 == g_hexagon_appcfg.enable_q_mulmat) { if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && (src1->type == GGML_TYPE_F32); @@ -5199,14 +5571,12 @@ static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const if (src0_rank != src1_rank) // make QNN SDK happy return false; - if (src0_rank < 2) // QNN's limitation, make QNN SDK happy - return false; - - if (4 == src0_rank) //TODO: 4D matrix mulmat in CT - return false; - - if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy + if (src0_rank != 2) { + // FIXME: there are some limitations for mulmat in QNN SDK: rank >= 2. + // keep same filter logic with QNN solution to compare NPU performance between + // cDSP approach and QNN-NPU approach, remove these filters in the future return false; + } if (ctx->device == HEXAGON_BACKEND_QNNNPU) { if (1 == g_hexagon_appcfg.enable_q_mulmat) diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index 38f80b398bfb6..ad7cc77e15f9d 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -803,10 +803,10 @@ static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRIC static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { assert(n % QK_K == 0); assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); const block_q6_K * GGML_RESTRICT x = vx; const block_q8_K * GGML_RESTRICT y = vy; @@ -1061,6 +1061,9 @@ static void ggml_compute_forward_add_f32( uint64_t end_time = ggml_time_us(); uint64_t duration = (end_time - start_time); GGMLHEXAGON_LOG_DEBUG("duration %llu us", duration); +#if !GGMLHEXAGON_DEBUG + UNUSED(duration); +#endif GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); } diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 3d9ec2901756b..7515894204ca1 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -124,7 +124,7 @@ function check_and_download_ndk() function build_arm64 { - cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_HEXAGON=ON -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} + cmake -H. 
-B./out/android -DCMAKE_BUILD_TYPE=Release -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_HEXAGON=ON -DLLAMA_CURL=OFF -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} cd out/android make -j16 show_pwd @@ -134,7 +134,7 @@ function build_arm64 function build_arm64_debug { - cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Debug -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_HEXAGON=ON -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} + cmake -H. -B./out/android -DCMAKE_BUILD_TYPE=Debug -DGGML_OPENMP=OFF -DCMAKE_TOOLCHAIN_FILE=${ANDROID_NDK}/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=latest -DCMAKE_C_FLAGS=-march=armv8.7-a -DGGML_HEXAGON=ON -DLLAMA_CURL=OFF -DQNN_SDK_PATH=${QNN_SDK_PATH} -DHEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} -DHTP_ARCH_VERSION=${HTP_ARCH_VERSION} cd out/android make -j16 show_pwd diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 462ab09fbdc81..fc356e281bed8 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -1,6 +1,29 @@ +# +# Copyright (c) 2023-2025 The ggml authors +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to +# deal in the Software without restriction, including without limitation the +# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or +# sell copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. +# +# runtime configuration for ggml-hexagon backend +# [general] #version of ggml-hexagon.cpp on ARM-AP side -version = "1.01" +version = "1.80" #version of ggml-dsp.c on cDSP side ggmldsp_version = "0.60" @@ -8,42 +31,75 @@ ggmldsp_version = "0.60" #1: HEXAGON_BACKEND_QNNGPU #2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP #3: default ggml backend -hexagon_backend = 2 +hexagon_backend = 2 +# 0: hwaccel approach through HWACCEL_QNN: offload ggml op to QNN +# 1: hwaccel approach through HWACCEL_QNN_SINGLEGRAPH: mapping entire ggml cgraph to a single QNN graph +# 2: hwaccel approach through HWACCEL_CDSP:offload ggml op to cDSP directly +hwaccel_approach = 2 +# +#attention: +# a. HWACCEL_QNN_SINGLEGRAPH not supported at the moment; +# b. 
following combinations are valid:
+#          1: hwaccel_approach = 2 AND hexagon_backend = 2(this is the default setting)
+#          2: hwaccel_approach = 0 AND hexagon_backend = 2(QNNNPU)
+#          3: hwaccel_approach = 0 AND hexagon_backend = 1(QNNGPU)
+#          4: hwaccel_approach = 0 AND hexagon_backend = 0(QNNCPU)
+#          5: hwaccel_approach = 2 AND hexagon_backend = 3
+#          6: hwaccel_approach = 0 AND hexagon_backend = 3
+#
+#generally speaking, we only need to focus on b-1 and b-2 in this PR.
+
+
+#enable/disable offload quantized type mulmat
+#quantized type mulmat works fine through HWACCEL_QNN at the moment
+#quantized type mulmat doesn't work well through HWACCEL_CDSP at the moment
+#this item will make mulmat performance comparison easier
+enable_q_mulmat = 0
+
-# enable/disable QNN's internal log
-print_qnn_internal_log = 0
-# enable/disable perf of op function
-enable_perf = 1
 # enable/disable print tensors info in op function
 print_tensors_info = 0
 # enable/disable dump op info in handle_op
 dump_op_info = 0
-#enable/disable offload quantized type mulmat
-#quatized type mulmat works fine in HWACCEL_QNN at the moment
-#quatized type mulmat doesn't works fine in HWACCEL_CDSP at the moment
-#this item will make mulmat performance comprision easily
-enable_q_mulmat = 0
-# 0: hwaccel approach through HWACCEL_QNN: offload ggml op to QNN
-# 1: hwaccel approach through HWACCEL_QNN_SINGLEGRAPH: mapping entire ggml cgraph to a single QNN graph
-# 2: hwaccel approach through HWACCEL_CDSP:offload ggml op to cDSP directly
-# HWACCEL_QNN_SINGLEGRAPH not supported at the moment
-hwaccel_approach = 2
+# enable/disable perf of op function
+# this is the default setting
+enable_perf = 1
+
-#hwaccel approach through QNN
+# enablie/disable profiler feature to visually compare NPU performance between HWACCEL_CDSP and HWACCEL_QNN
+# this is not default setting but useful before PR in upstream can be approved
+enable_profiler = 1
+#threshold duration of NPU performance profiler, per seconds
+profiler_duration = 5
+#threshold counst of NPU performance profiler
+profiler_counts = 200
+#attention:
+# NPU performance might be slower when enable_profiler = 1 because of file I/O in this feature;
+# ensure enable_perf = 1 when set enable_profiler = 1;
+
+
+#hwaccel approach through QNN(offload ggml op to QNN-NPU)
 [qnn]
+# enable/disable QNN SDK's internal log, this will very helpful for troubleshooting in HWACCEL_QNN approach
+print_qnn_internal_log = 0
+
 hvx_threads = 4
 vtcm_size_in_mb = 8
 enable_dlbc = 1
 precision_mode = "fp16"
-#hwaccel approach through cDSP
+
+#hwaccel approach through cDSP(offload ggml op to Hexagon cDSP directly)
 [cdsp]
 #enable/disable rpc ion memory pool
 enable_rpc_ion_mempool = 0
+
 #enable/disable rpc dma memory pool
 enable_rpc_dma_mempool = 0
+
 #enable/disable offload all quantized type mulmat to cDSP
-#ensure enable_q_mulmat already be setting to 1
 enable_all_q_mulmat = 0
+#attention:
+#ensure enable_q_mulmat = 1 when set enable_all_q_mulmat = 1

From 4e31ae2ee5c23d483c23f7c8e0cd2ce3de507bae Mon Sep 17 00:00:00 2001
From: zhouwg
Date: Tue, 8 Apr 2025 11:22:40 +0800
Subject: [PATCH 171/200] ggml-hexagon: remove so-called dma memory pool to avoid confusion and ambiguity

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 24 ++----------------------
 scripts/ggml-hexagon.cfg               |  3 ---
 2 files changed, 2 insertions(+), 25 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index 208024258f50b..c3fa1efa61a1b 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ 
b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -322,7 +322,6 @@ struct hexagon_appcfg_t { int hwaccel_approach; // 0: HWACCEL_QNN 1: HWACCEL_QNN_SINGLEGRAPH 2: HWACCEL_CDSP int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool - int enable_rpc_dma_mempool; // enable/disable rpc dma memory pool int enable_all_q_mulmat; // enable/disable offload all quantized type mulmat to cDSP int profiler_duration; // threshold of duration in profiler, per seconds int profiler_counts; // threshold of counts in profiler @@ -346,7 +345,6 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .hwaccel_approach = HWACCEL_CDSP, .hexagon_backend = HEXAGON_BACKEND_CDSP, .enable_rpc_ion_mempool = 0, - .enable_rpc_dma_mempool = 0, .enable_all_q_mulmat = 0, .profiler_duration = 5, .profiler_counts = 100, @@ -1783,7 +1781,6 @@ static void ggmlhexagon_load_cfg() { hexagoncfg_instance.get_intvalue("qnn", "print_qnn_internal_log", g_hexagon_appcfg.print_qnn_internal_log, 0); hexagoncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 0); - hexagoncfg_instance.get_intvalue("cdsp", "enable_rpc_dma_mempool", g_hexagon_appcfg.enable_rpc_dma_mempool, 0); hexagoncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0); GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); @@ -1835,16 +1832,6 @@ static bool ggmlhexagon_check_valid_appcfg() { is_valid_appcfg = false; } - if ((1 == g_hexagon_appcfg.enable_rpc_ion_mempool) && (1 == g_hexagon_appcfg.enable_rpc_dma_mempool)) { - GGMLHEXAGON_LOG_INFO("rpc ion mempool and rpc dma mempool cannot be enabled at the same time"); - is_valid_appcfg = false; - } - - if (1 == g_hexagon_appcfg.enable_rpc_dma_mempool) { - GGMLHEXAGON_LOG_INFO("rpc dma mempool not supported"); - is_valid_appcfg = false; - } - if (1 == g_hexagon_appcfg.enable_all_q_mulmat) { if (0 == g_hexagon_appcfg.enable_q_mulmat) { GGMLHEXAGON_LOG_INFO("ensure set enable_q_mulmat to 1 firstly when set enable_all_q_mulmat to 1"); @@ -1874,7 +1861,6 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); - GGMLHEXAGON_LOG_INFO("using rpc dma memory pool: %s", g_hexagon_appcfg.enable_rpc_dma_mempool ? "YES" : "NO"); ggmlhexagon_probe_dspinfo(ctx); } else { GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? 
"YES" : "NO"); @@ -5107,7 +5093,7 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_WARN("rpc mempool is too big"); return; } - //FIXME: it seems there is unknown issue with DMA memory pool + //FIXME: it seems there is unknown issue with another ION memory pool ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, ctx->rpc_mempool_len); if (nullptr == ctx->rpc_mempool) { @@ -5123,10 +5109,6 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_handle); } - if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_dma_mempool)) { - //TODO - } - return; } @@ -5833,9 +5815,7 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { return "hexagon-ion-buffer"; } - if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_dma_mempool)) { - return "hexagon-dma-buffer"; - } + return "hexagon-normal-buffer"; } diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index fc356e281bed8..30e817e5e32d5 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -96,9 +96,6 @@ precision_mode = "fp16" #enable/disable rpc ion memory pool enable_rpc_ion_mempool = 0 -#enable/disable rpc dma memory pool -enable_rpc_dma_mempool = 0 - #enable/disable offload all quantized type mulmat to cDSP enable_all_q_mulmat = 0 #attention: From abe685730b0324146a0571f648c8c52339fbfe75 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 8 Apr 2025 14:09:01 +0800 Subject: [PATCH 172/200] ggml-hexagon: make function ggmlhexagon_init_rpcmempool in ggml-hexagon.cpp more robust --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c3fa1efa61a1b..17f52d5f867b5 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -5065,11 +5065,15 @@ static int ggmlhexagon_request_status_notifications(int domain_id, void * contex return hexagon_error; } -static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { +static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; size_t probe_slots[] = {1024, 1536, 2000, 2048}; size_t probe_counts = sizeof(probe_slots) / sizeof(size_t); + + if (nullptr == ctx) + return 1; + for (size_t idx = 0; idx < probe_counts; idx++) { rpc_buffer = static_cast(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); if (nullptr == rpc_buffer) { @@ -5087,18 +5091,15 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_INFO("capacity of rpc memory %d MiB", ctx->rpc_mempool_capacity / SIZE_IN_MB); if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { - //FIXME: reasonable rpc memory pool size through a better approach rather than hardcoded size - ctx->rpc_mempool_len = 1024 * SIZE_IN_MB; - if (ctx->rpc_mempool_len > ctx->rpc_mempool_capacity) { - GGMLHEXAGON_LOG_WARN("rpc mempool is too big"); - return; - } + GGML_ASSERT(ctx->rpc_mempool_capacity > (8 * SIZE_IN_MB)); + ctx->rpc_mempool_len = 
ctx->rpc_mempool_capacity - (8 * SIZE_IN_MB); + //FIXME: it seems there is unknown issue with another ION memory pool ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, ctx->rpc_mempool_len); if (nullptr == ctx->rpc_mempool) { GGMLHEXAGON_LOG_WARN("alloc rpc memorypool %d failed", ctx->rpc_mempool_len); - return; + return 2; } else { GGMLHEXAGON_LOG_DEBUG("alloc rpc memorypool %p successfully %ld(%d MiB)", ctx->rpc_mempool, ctx->rpc_mempool_len, @@ -5109,7 +5110,7 @@ static void ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_handle); } - return; + return 0; } static void ggmlhexagon_deinit_rpcmempool(ggml_backend_hexagon_context * ctx) { @@ -5316,7 +5317,11 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { ggmlhexagon_probe_dspinfo(ctx); ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1); ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100); - ggmlhexagon_init_rpcmempool(ctx); + int result = ggmlhexagon_init_rpcmempool(ctx); + if (0 != result) { + GGMLHEXAGON_LOG_INFO("failed to init rpc mempool"); + goto bail; + } } else { GGMLHEXAGON_LOG_INFO("error 0x%x: failed to open domain %d(%s)", hexagon_error, domain_id, ggmlhexagon_get_dsp_name(domain_id)); From 1bfae3594a61977a03092c8bddf94432d1f83530 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 8 Apr 2025 15:21:36 +0800 Subject: [PATCH 173/200] ggml-hexagon: fix potential resource leak in class hexagon_profiler --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 17f52d5f867b5..a0dfd1a12db84 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -909,7 +909,7 @@ class hexagon_profiler { written_size = fwrite(prefix, 1, strlen(prefix), _fp_profile_file); if (written_size != strlen(prefix)) { GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno)); - reset(); + profiler_deinit(); return; } @@ -918,7 +918,7 @@ class hexagon_profiler { written_size = fwrite(profiler_info, 1, strlen(profiler_info), _fp_profile_file); if (written_size != strlen(profiler_info)) { GGMLHEXAGON_LOG_WARN("write data to file %s failed, reason: %s", profiler_filename, strerror(errno)); - reset(); + profiler_deinit(); return; } fprintf(_fp_profile_file, "\n\n"); From 200dae8e8fe0aa80755df6fc671a2f210e1165c4 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 8 Apr 2025 21:16:28 +0800 Subject: [PATCH 174/200] ggml-hexagon: enable multi-threading feature on cDSP side --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 25 ++++++++++++++----- ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 22 ++++++++++++++-- .../src/ggml-hexagon/kernels/ggmlop_ap_skel.c | 9 ++++--- .../src/ggml-hexagon/kernels/ggmlop_ap_skel.h | 2 +- .../ggml-hexagon/kernels/ggmlop_cdsp_skel.c | 6 +++-- scripts/ggml-hexagon.cfg | 12 +++++++-- 6 files changed, 59 insertions(+), 17 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index a0dfd1a12db84..e9be4b4fec250 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -325,6 +325,7 @@ struct hexagon_appcfg_t { int enable_all_q_mulmat; // enable/disable offload all quantized type mulmat to cDSP int profiler_duration; // threshold of duration in profiler, per 
seconds int profiler_counts; // threshold of counts in profiler + int thread_counts; // thread_counts on cDSP side const char * cfgfilename; const char * runtime_libpath; char ggml_hexagon_version[GGMLHEXAGON_TMPBUF_LEN]; @@ -348,6 +349,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .enable_all_q_mulmat = 0, .profiler_duration = 5, .profiler_counts = 100, + .thread_counts = 4, .cfgfilename = "ggml-hexagon.cfg", #if defined(__ANDROID__) //Android command line program @@ -357,8 +359,8 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { #elif defined(_WIN32) .qnn_runtimelib_path = "C:\\", #endif - .ggml_hexagon_version = {"1.80"}, - .ggml_dsp_version = {"0.60"}, + .ggml_hexagon_version = {"1.81"}, + .ggml_dsp_version = {"0.61"}, }; //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices @@ -886,10 +888,19 @@ class hexagon_profiler { //FIXME:hardcode filename of profiler data std::string filename = std::string(g_hexagon_appcfg.runtime_libpath) + "/"; if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) { - filename = filename + "hexagon_perf_cdsp.dat"; + if (g_hexagon_appcfg.thread_counts > 1) { + //multi-threading feature enabled on cDSP side + if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) { + filename = filename + "hexagon_perf_cdsp_mt.dat"; + } else { + filename = filename + "hexagon_perf_cdsp_ion_mt.dat"; + } } else { - filename = filename + "hexagon_perf_cdsp_ion.dat"; + if (0 == g_hexagon_appcfg.enable_rpc_ion_mempool) { + filename = filename + "hexagon_perf_cdsp.dat"; + } else { + filename = filename + "hexagon_perf_cdsp_ion.dat"; + } } } else { filename = filename + "hexagon_perf_qnn.dat"; @@ -1782,6 +1793,7 @@ static void ggmlhexagon_load_cfg() { hexagoncfg_instance.get_intvalue("cdsp", "enable_rpc_ion_mempool", g_hexagon_appcfg.enable_rpc_ion_mempool, 0); hexagoncfg_instance.get_intvalue("cdsp", "enable_all_q_mulmat", g_hexagon_appcfg.enable_all_q_mulmat, 0); + hexagoncfg_instance.get_intvalue("cdsp", "thread_counts", g_hexagon_appcfg.thread_counts, 4); GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version); @@ -5315,7 +5327,8 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { //FIXME: only support offload fp32 GGML_OP_MUL_MAT to cDSP GGMLHEXAGON_LOG_INFO("only support offload fp32 GGML_OP_ADD and fp32 GGML_OP_MUL_MAT to cDSP currently"); ggmlhexagon_probe_dspinfo(ctx); - ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1); + //FIXME: re-use this function to pass thread_counts info to code on cDSP side before fully understand qidl mechanism + ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts); ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100); int result = ggmlhexagon_init_rpcmempool(ctx); if (0 != result) { diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index ad7cc77e15f9d..5eaf714df3cbc 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -876,6 +876,7 @@ static int64_t ggml_time_us(void) { // ================================================================================================= // section-4: ggml-hexagon kernel helper function // 
================================================================================================= +static int32 g_thread_counts = 1; int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { void *tptr = NULL; FARF(HIGH, "uri %s", uri); @@ -897,13 +898,15 @@ int ggmlop_dsp_close(remote_handle64 handle) { return 0; } -AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled) { +AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); HAP_power_request_t request; memset(&request, 0, sizeof(HAP_power_request_t)); request.type = HAP_power_set_apptype; request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; + g_thread_counts = thread_counts; + void * ggmop_ctx = (void*)(handle); int retval = HAP_power_set(ggmop_ctx, &request); if (retval) { @@ -1192,7 +1195,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( } //FIXME: only support fp32 mulmat on cDSP -int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); ggmlhexagon_dump_tensor(src0, 0); ggmlhexagon_dump_tensor(src1, 0); @@ -1353,6 +1356,21 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const ggml_tensor * src0, const ggml_te return 0; } +int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_dsp_mulmat(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) { + if (g_thread_counts > 1) { + return ggmlop_dsp_mulmat_multithread(h, src0, src1, dst); + } else { + return ggmlop_dsp_mulmat_singlethread(h, src0, src1, dst); + } + return 0; +} + int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c index 025735aad8cbf..b0a660ce96a79 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c @@ -296,24 +296,25 @@ __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_hand __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { return __QAIC_REMOTE(remote_handle64_close)(h); } -static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1]) { +static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1], uint32_t _in3[1]) { remote_arg _pra[1] = {0}; - uint32_t _primIn[3]= {0}; + uint32_t _primIn[4]= {0}; int _nErr = 0; _pra[0].buf.pv = (void*)_primIn; _pra[0].buf.nLen = sizeof(_primIn); _COPY(_primIn, 0, _in0, 0, 4); _COPY(_primIn, 4, _in1, 0, 4); _COPY(_primIn, 8, _in2, 0, 4); + _COPY(_primIn, 12,_in3, 0, 4); _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra)); _CATCH_FARF(_nErr) { _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, 
REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _mid, __func__); } return _nErr; } -__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 dcvs_enable) __QAIC_STUB_ATTRIBUTE { +__QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_STUB_ATTRIBUTE { uint32_t _mid = 2; - return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable); + return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable, (uint32_t*)&threads); } static __inline int _stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { int _nErr = 0; diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h index b048988410dc8..f189c48d0238b 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h @@ -272,7 +272,7 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_open)(const char* uri, remote_ * @retval, 0 on success, should always succeed */ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_close)(remote_handle64 h) __QAIC_HEADER_ATTRIBUTE; -__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 dcvs_enable) __QAIC_HEADER_ATTRIBUTE; +__QAIC_HEADER_EXPORT AEEResult __QAIC_HEADER(ggmlop_dsp_setclocks)(remote_handle64 _h, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_add)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_mulmat)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_softmax)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c index 7cce9d050f3fb..1e9d31a72319d 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c +++ b/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c @@ -397,11 +397,12 @@ static __inline int _skel_method(int (*_pfn)(remote_handle64, const dsptensor*, _allocator_deinit(_al); return _nErr; } -static __inline int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, int32), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { +static __inline int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, int32, int32), remote_handle64 _h, uint32_t _sc, remote_arg* _pra) { remote_arg* _praEnd = 0; uint32_t _in0[1] = {0}; uint32_t _in1[1] = {0}; uint32_t _in2[1] = {0}; + uint32_t _in3[1] = {0}; uint32_t* _primIn= 0; int _nErr = 0; _praEnd = ((_pra + REMOTE_SCALARS_INBUFS(_sc)) + REMOTE_SCALARS_OUTBUFS(_sc) + REMOTE_SCALARS_INHANDLES(_sc) + REMOTE_SCALARS_OUTHANDLES(_sc)); @@ -415,7 +416,8 @@ static __inline 
int _skel_method_1(int (*_pfn)(remote_handle64, int32, int32, in _COPY(_in0, 0, _primIn, 0, 4); _COPY(_in1, 0, _primIn, 4, 4); _COPY(_in2, 0, _primIn, 8, 4); - _TRY(_nErr, _pfn(_h, (int32)*_in0, (int32)*_in1, (int32)*_in2)); + _COPY(_in3, 0, _primIn, 12, 4); + _TRY(_nErr, _pfn(_h, (int32)*_in0, (int32)*_in1, (int32)*_in2, (int32)*_in3)); _QAIC_CATCH(_nErr) {} return _nErr; } diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 30e817e5e32d5..b5afc14c355cc 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -23,9 +23,9 @@ # [general] #version of ggml-hexagon.cpp on ARM-AP side -version = "1.80" +version = "1.81" #version of ggml-dsp.c on cDSP side -ggmldsp_version = "0.60" +ggmldsp_version = "0.61" #0: HEXAGON_BACKEND_QNNCPU #1: HEXAGON_BACKEND_QNNGPU @@ -100,3 +100,11 @@ enable_rpc_ion_mempool = 0 enable_all_q_mulmat = 0 #attention: #ensure enable_q_mulmat = 1 when set enable_all_q_mulmat = 1 + +#enable/disable multi-threading on cDSP side +# 0 disable multi-threading on cDSP side +# 1 disable multi-threading on cDSP side +# 2 setting thread_counts to 2 on cDSP side +# 3 setting thread_counts to 3 on cDSP side +# 4 setting thread_counts to 4 on cDSP side +thread_counts = 1 From 8b9375ddb3aaaf7bc862b87886bf358cf7b847d3 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 9 Apr 2025 09:17:33 +0800 Subject: [PATCH 175/200] ggml-hexagon: upgrade QNN SDK to v2.33.0.250327 --- ggml/src/ggml-hexagon/CMakeLists.txt | 5 +++-- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 2 +- scripts/build-run-android.sh | 10 +++++++--- scripts/ggml-hexagon.cfg | 2 +- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 361b444932801..a43470b33231b 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -95,8 +95,9 @@ function(ggml_hexagon_build_kernel KNAME) COMMAND echo "current working path:`pwd`\n" COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -c ${HEXAGON_KERNELS_PATH}/ggml-dsp.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic ${DEBUG_FLAG} -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/posix -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/qurt/ COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o -c ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc - COMMAND ${HEXAGON_CC} -m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc 
-Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${HEXAGON_TARGET} -o ${HEXAGON_KERNELS_PATH}/${HEXAGON_TARGET} -Wl,--start-group ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -Wl,--end-group - COMMAND ls -l ${HEXAGON_KERNELS_PATH}/${HEXAGON_TARGET} + COMMAND ${HEXAGON_CC} -m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${HEXAGON_TARGET} -o ../../../bin/${HEXAGON_TARGET} -Wl,--start-group ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -Wl,--end-group + COMMAND ls -l ../../../bin/${HEXAGON_TARGET} + COMMAND /bin/cp -fv ../../../bin/${HEXAGON_TARGET} ../../../bin/libggmlop_skel.so COMMENT "build hexagon-kernel" ) endfunction() diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index e9be4b4fec250..c1e8f5207ba50 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -359,7 +359,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { #elif defined(_WIN32) .qnn_runtimelib_path = "C:\\", #endif - .ggml_hexagon_version = {"1.81"}, + .ggml_hexagon_version = {"1.03"}, .ggml_dsp_version = {"0.61"}, }; diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 7515894204ca1..b686c4abf321f 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -16,6 +16,7 @@ GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/ QNN_SDK_VERSION=2.32.0.250228 +QNN_SDK_VERSION=2.33.0.250327 QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION} #5.5.3.0 should be also ok @@ -27,6 +28,10 @@ HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 #v75 --- Snapdragon 8 Gen3 #v79 --- Snapdragon 8 Elite(aka Gen4) HTP_ARCH_VERSION=v75 +HTP_ARCH_VERSION_a=V75 + +HTP_ARCH_VERSION=v79 +HTP_ARCH_VERSION_a=V79 #running_params=" -mg 2 -ngl 99 " #running_params=" -mg 2 -ngl 99 -t 8 -fa 1 " @@ -176,8 +181,8 @@ function update_qnn_libs() adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp.so ${REMOTE_PATH}/ adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpNetRunExtensions.so ${REMOTE_PATH}/ adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpPrepare.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtpV75Stub.so ${REMOTE_PATH}/ - adb push ${QNN_SDK_PATH}/lib/hexagon-v75/unsigned/libQnnHtpV75Skel.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/aarch64-android/libQnnHtp${HTP_ARCH_VERSION_a}Stub.so ${REMOTE_PATH}/ + adb push ${QNN_SDK_PATH}/lib/hexagon-${HTP_ARCH_VERSION}/unsigned/libQnnHtp${HTP_ARCH_VERSION_a}Skel.so ${REMOTE_PATH}/ } @@ -225,7 +230,6 @@ function prepare_run_on_phone() fi adb push ./out/android/bin/${program} ${REMOTE_PATH}/ adb shell chmod +x ${REMOTE_PATH}/${program} - adb push ggml/src/ggml-hexagon/kernels/libggmlop_skel${HTP_ARCH_VERSION}.so ${REMOTE_PATH}/libggmlop_skel.so } function run_llamacli() diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index b5afc14c355cc..cf9995e7d796e 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -23,7 +23,7 @@ # [general] #version of ggml-hexagon.cpp on ARM-AP side -version = "1.81" +version = "1.03" #version of ggml-dsp.c on cDSP side ggmldsp_version = "0.61" From 
a339a4df5bfc2754d5ce1d21b555c400209654dd Mon Sep 17 00:00:00 2001
From: zhouwg
Date: Wed, 9 Apr 2025 19:06:42 +0800
Subject: [PATCH 176/200] ggml-hexagon: fix typo in ggml-hexagon.cpp

---
 ggml/src/ggml-hexagon/ggml-hexagon.cpp | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
index c1e8f5207ba50..5112a13661d54 100644
--- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp
+++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp
@@ -1002,7 +1002,6 @@
         if (0 == elapse_time) {
             //filter invalid profiler data
-            GGMLHEXAGON_LOG_DEBUG("_enable_profiler %d", _enable_profiler);
             return;
         }
@@ -1197,10 +1196,10 @@ class hexagon_perf {
         _end_time = ggml_time_us();
         _duration = (_end_time - _begin_time);
-        //I think add following judgement will useful for other developers and AI experts although:
+        //the following judgement is useful for other developers and AI experts, although:
         //    it breaks the original logic
         //    it's not mandatory
-        //    I had to expose two public function in hexagon_profiler class
+        //    it requires exposing two public functions in the hexagon_profiler class
         if (g_hexagon_profiler.profiler_get_frame_index() <= g_hexagon_profiler.profiler_get_threshold_count()) {
             GGMLHEXAGON_LOG_VERBOSE("inference duration of %s through %s: %lld microseconds",
                 _perf_name.c_str(), ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach), _duration);
@@ -1774,7 +1773,7 @@ static void ggmlhexagon_load_cfg() {
     });
     std::string precision_mode;
     std::string ggml_hexagon_version;
-    hexagoncfg_instance.get_stringvalue("general", "ggml_hexagon_version", ggml_hexagon_version, "1.00");
+    hexagoncfg_instance.get_stringvalue("general", "ggml_hexagon_version", ggml_hexagon_version, "1.03");
     hexagoncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1);
     hexagoncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0);
     hexagoncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0);
@@ -1888,7 +1887,7 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c
 // =================================================================================================
 //  section-5: QNN helper function/class
 // =================================================================================================
-//ensure every QNN tensor/opcfg name is unique, threadsafe is not required at the moment
+//make sure every QNN tensor/opcfg name is unique, threadsafe is not required at the moment
 static void ggmlqnn_reset_idx() {
     g_qnntensor_idx = 0;
     g_qnnopcfg_idx = 0;
@@ -5341,7 +5340,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) {
         goto bail;
     }
-    //ensure test-backend-ops get the correct backend name when hwaccel approach is 2(HWACCEL_CDSP)
+    //make sure test-backend-ops get the correct backend name when hwaccel approach is 2(HWACCEL_CDSP)
     memcpy(g_hexagon_mgr[ctx->device].name, "Hexagon-cDSP", strlen("Hexagon-cDSP"));
     return 0;
@@ -5740,7 +5739,7 @@ struct ggml_backend_hexagon_buffer_context {
     ~ggml_backend_hexagon_buffer_context() {
         if (buffer) {
             if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) {
-                //do nonthing here because rpc mempool was used for HWACCEL_CDSP
+                //do nothing here because rpc mempool was used for HWACCEL_CDSP
             } else {
                 ggml_aligned_free(buffer, 0);
             }
@@ -6272,6 +6271,9 @@ static void 
ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_thr int ggml_backend_hexagon_get_device_count() { if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); + //here is the trick: + //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP + //so return 1 return 1; } else { return GGML_HEXAGON_MAX_DEVICES; @@ -6516,7 +6518,7 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_lib return nullptr; } } else { - //got fully description of SoC when hwaccel approach is HWACCEL_QNN and backend is HEXAGON_BACKEND_QNNNPU + //get fully description of SoC when hwaccel approach is HWACCEL_QNN and backend is HEXAGON_BACKEND_QNNNPU GGMLHEXAGON_LOG_INFO("device name %s", ggml_backend_hexagon_device_get_description(hexagon_backend->device)); } GGMLHEXAGON_LOG_DEBUG("leave %s", __func__); From 6b47f081687e662696b1eac2f29626fedd8cc2f4 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 9 Apr 2025 22:30:12 +0800 Subject: [PATCH 177/200] ggml-dsp: probe QuRT RTOS information in function ggmlop_dsp_open --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 2 ++ ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 15 +++++++++++++++ scripts/ggml-hexagon.cfg | 10 ++++------ 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 5112a13661d54..f31d0634fffc4 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1872,8 +1872,10 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? "YES" : "NO"); GGMLHEXAGON_LOG_INFO("using rpc ion memory pool: %s", g_hexagon_appcfg.enable_rpc_ion_mempool ? "YES" : "NO"); + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_CDSP: %d", g_hexagon_appcfg.thread_counts); ggmlhexagon_probe_dspinfo(ctx); } else { + GGMLHEXAGON_LOG_INFO("thread_counts with HWACCEL_QNN: %d", g_hexagon_appcfg.hvx_threads); GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? 
"YES" : "NO"); } GGMLHEXAGON_LOG_INFO("running timestamp:%s", timestamp); diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index 5eaf714df3cbc..3c1c75b27f749 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -32,6 +32,8 @@ #include "HAP_vtcm_mgr.h" #include "HAP_compute_res.h" +#include "qurt.h" + #include "AEEStdErr.h" #include "hexagon_types.h" #include "hexagon_protos.h" @@ -886,6 +888,19 @@ int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { ggml_init(); + unsigned int api_version = qurt_api_version(); + GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", api_version); + GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%d", qurt_hvx_get_units()); + qurt_arch_version_t vers; + qurt_sysenv_get_arch_version(&vers); + GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", vers.arch_version); + qurt_sysenv_app_heap_t aheap; + qurt_sysenv_get_app_heap(&aheap); + GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit); + qurt_sysenv_max_hthreads_t mhwt; + qurt_sysenv_get_max_hw_threads(&mhwt); + GGMLHEXAGON_LOG_DEBUG("max hardware threads=%d", mhwt.max_hthreads); + return 0; } diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index cf9995e7d796e..545273dd6b615 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -85,7 +85,7 @@ profiler_counts = 200 # enable/disable QNN SDK's internal log, this will very helpful for troubleshooting in HWACCEL_QNN approach print_qnn_internal_log = 0 -hvx_threads = 4 +hvx_threads = 8 vtcm_size_in_mb = 8 enable_dlbc = 1 precision_mode = "fp16" @@ -102,9 +102,7 @@ enable_all_q_mulmat = 0 #ensure enable_q_mulmat = 1 when set enable_all_q_mulmat = 1 #enable/disable multi-threading on cDSP side -# 0 disable multi-threading on cDSP side -# 1 disable multi-threading on cDSP side -# 2 setting thread_counts to 2 on cDSP side -# 3 setting thread_counts to 3 on cDSP side -# 4 setting thread_counts to 4 on cDSP side +# 0 disable multi-threading on cDSP side +# 1 disable multi-threading on cDSP side +# 2-8 thread_counts on cDSP side thread_counts = 1 From 3b5f1722d1081570da06f257d1d9e4c1c2111963 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 10 Apr 2025 10:28:26 +0800 Subject: [PATCH 178/200] ggml-hexagon: setting enable_rpc_ion_mempool to 1 and make test-backend-ops happy --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 7 +++---- ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 3 +-- scripts/ggml-hexagon.cfg | 8 ++++---- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index f31d0634fffc4..2a31c319d0ca3 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -359,7 +359,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { #elif defined(_WIN32) .qnn_runtimelib_path = "C:\\", #endif - .ggml_hexagon_version = {"1.03"}, + .ggml_hexagon_version = {"1.04"}, .ggml_dsp_version = {"0.61"}, }; @@ -5107,9 +5107,8 @@ static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { GGML_ASSERT(ctx->rpc_mempool_capacity > (8 * SIZE_IN_MB)); ctx->rpc_mempool_len = ctx->rpc_mempool_capacity - (8 * SIZE_IN_MB); - //FIXME: it seems there is unknown issue with another ION memory pool - ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, - ctx->rpc_mempool_len); + //FIXME: it seems there is unknown issue with 2+ GiB memory pool + ctx->rpc_mempool = 
rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len); if (nullptr == ctx->rpc_mempool) { GGMLHEXAGON_LOG_WARN("alloc rpc memorypool %d failed", ctx->rpc_mempool_len); return 2; diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index 3c1c75b27f749..e79341ed27569 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -888,8 +888,7 @@ int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { ggml_init(); - unsigned int api_version = qurt_api_version(); - GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", api_version); + GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version()); GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%d", qurt_hvx_get_units()); qurt_arch_version_t vers; qurt_sysenv_get_arch_version(&vers); diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 545273dd6b615..6e5a37e03c520 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -23,7 +23,7 @@ # [general] #version of ggml-hexagon.cpp on ARM-AP side -version = "1.03" +version = "1.04" #version of ggml-dsp.c on cDSP side ggmldsp_version = "0.61" @@ -69,8 +69,8 @@ enable_perf = 1 # enablie/disable profiler feature to visually compare NPU performance between HWACCEL_CDSP and HWACCEL_QNN -# this is not default setting but useful before PR in upstream can be approved -enable_profiler = 1 +# this is default setting +enable_profiler = 0 #threshold duration of NPU performance profiler, per seconds profiler_duration = 5 #threshold counst of NPU performance profiler @@ -94,7 +94,7 @@ precision_mode = "fp16" #hwaccel approach through cDSP(offload ggml op to Hexagon cDSP directly) [cdsp] #enable/disable rpc ion memory pool -enable_rpc_ion_mempool = 0 +enable_rpc_ion_mempool = 1 #enable/disable offload all quantized type mulmat to cDSP enable_all_q_mulmat = 0 From 68e325b86ab4f99bafba75243a98a5fc7fecaadb Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 10 Apr 2025 20:45:35 +0800 Subject: [PATCH 179/200] ggml-hexagon: check whether user's specified htp arch is valid in CMakeLists.txt --- ggml/src/ggml-hexagon/CMakeLists.txt | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index a43470b33231b..80186509f76b4 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -32,8 +32,18 @@ endif() #v75 --- Snapdragon 8 Gen3 #v79 --- Snapdragon 8 Elite(aka Gen4) if(NOT DEFINED HTP_ARCH_VERSION) - #set default HTP_ARCH_VERSION to v75 - set(HTP_ARCH_VERSION v75) + message(FATAL_ERROR "HTP_ARCH_VERSION not defined, valid htp arch: v68,v69,v73,v75,v79") +endif() + +#check whether user's specified htp arch is valid +set(CHECK_HTP_ARCH "WRONG") +foreach (feat v68 v69 v73 v75 v79) + if (${feat} STREQUAL ${HTP_ARCH_VERSION}) + set(CHECK_HTP_ARCH "GOOD") + endif() +endforeach() +if (${CHECK_HTP_ARCH} STREQUAL "WRONG") + message(FATAL_ERROR "ggml-hexagon backend only support htp arch v68,v69,v73,v75,v79") endif() #cross compiling for hexagon kernels on cDSP side @@ -71,14 +81,14 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Android") elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows") set(QNN_DEFAULT_LIB_SEARCH_PATH "C:\\" CACHE STRING "customized library search path for QNN backend") else() - message(FATAL_ERROR "QNN now only available on Android and Windows(Windows on ARM)") + message(FATAL_ERROR "ggml-hexagon now only available on Android and 
Windows(Windows on ARM)") endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") -file(GLOB QNN_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/ggmlop_ap_skel.c") -ggml_add_backend_library(ggml-hexagon ${QNN_SOURCES}) +file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/ggmlop_ap_skel.c") +ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES}) target_include_directories(ggml-hexagon PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR}) target_link_libraries(ggml-hexagon PRIVATE ${QNN_LINK_LIBRARIES}) @@ -87,7 +97,7 @@ string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEAR target_compile_definitions(ggml-hexagon PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") function(ggml_hexagon_build_kernel KNAME) - message(STATUS "ggml_hexagon: build kernel ${KNAME}") + message(STATUS "ggml_hexagon: build hexagon-kernel ${KNAME}") add_custom_command( TARGET ${PROJECT_NAME} From 7f24cd6a21c0677105beed6585aa5f7465b4ba48 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 11 Apr 2025 13:04:00 +0800 Subject: [PATCH 180/200] ggml-hexagon: sync with upstream --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 2a31c319d0ca3..7ddda008da192 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -573,14 +573,10 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr}, {false, GGML_OP_RWKV_WKV7, 0, nullptr}, {false, GGML_OP_UNARY, 0, nullptr}, - {false, GGML_OP_MAP_UNARY, 0, nullptr}, - {false, GGML_OP_MAP_BINARY, 0, nullptr}, - {false, GGML_OP_MAP_CUSTOM1_F32, 0, nullptr}, - {false, GGML_OP_MAP_CUSTOM2_F32, 0, nullptr}, - {false, GGML_OP_MAP_CUSTOM3_F32, 0, nullptr}, {false, GGML_OP_MAP_CUSTOM1, 0, nullptr}, {false, GGML_OP_MAP_CUSTOM2, 0, nullptr}, {false, GGML_OP_MAP_CUSTOM3, 0, nullptr}, + {false, GGML_OP_CUSTOM, 0, nullptr}, {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr}, {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr}, {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr}, @@ -683,14 +679,10 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_GATED_LINEAR_ATTN, 0, nullptr, nullptr}, {false, GGML_OP_RWKV_WKV7, 0, nullptr, nullptr}, {false, GGML_OP_UNARY, 0, nullptr, nullptr}, - {false, GGML_OP_MAP_UNARY, 0, nullptr, nullptr}, - {false, GGML_OP_MAP_BINARY, 0, nullptr, nullptr}, - {false, GGML_OP_MAP_CUSTOM1_F32, 0, nullptr, nullptr}, - {false, GGML_OP_MAP_CUSTOM2_F32, 0, nullptr, nullptr}, - {false, GGML_OP_MAP_CUSTOM3_F32, 0, nullptr, nullptr}, {false, GGML_OP_MAP_CUSTOM1, 0, nullptr, nullptr}, {false, GGML_OP_MAP_CUSTOM2, 0, nullptr, nullptr}, {false, GGML_OP_MAP_CUSTOM3, 0, nullptr, nullptr}, + {false, GGML_OP_CUSTOM, 0, nullptr, nullptr}, {false, GGML_OP_CROSS_ENTROPY_LOSS, 0, nullptr, nullptr}, {false, GGML_OP_CROSS_ENTROPY_LOSS_BACK, 0, nullptr, nullptr}, {false, GGML_OP_OPT_STEP_ADAMW, 0, nullptr, nullptr}, From cb0dfd75b04497ca2773ad7180d3d5659d1360bc Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 11 Apr 2025 13:43:17 +0800 Subject: [PATCH 181/200] ggml-hexagon: refine pinned-memory feature --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 52 ++++++++++++++++++++++++-- 
scripts/ggml-hexagon.cfg | 3 ++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 7ddda008da192..8b8286bc6f58b 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -255,6 +255,28 @@ enum qcom_chipset_soc_model { #endif }; +//borrowed from Android source code, might not be accurate +enum ion_heap_ids { + INVALID_HEAP_ID = -1, + ION_CP_MM_HEAP_ID = 8, + ION_SECURE_HEAP_ID = 9, + ION_SECURE_DISPLAY_HEAP_ID = 10, + ION_CP_MFC_HEAP_ID = 12, + ION_SPSS_HEAP_ID = 13, + ION_CP_WB_HEAP_ID = 16, + ION_CAMERA_HEAP_ID = 20, + ION_SYSTEM_CONTIG_HEAP_ID = 21, + ION_ADSP_HEAP_ID = 22, + ION_PIL1_HEAP_ID = 23, + ION_SF_HEAP_ID = 24, + ION_SYSTEM_HEAP_ID = 25, + ION_PIL2_HEAP_ID = 26, + ION_QSECOM_HEAP_ID = 27, + ION_AUDIO_HEAP_ID = 28, + ION_MM_FIRMWARE_HEAP_ID = 29, + ION_HEAP_ID_RESERVED = 31 +}; + struct qcom_socinfo { uint32_t soc_model; size_t htp_arch; @@ -315,6 +337,7 @@ struct hexagon_appcfg_t { int print_tensors_info; // enable/disable print tensors info in op function int dump_op_info; // enable/disable dump op info in handle_op int enable_q_mulmat; // enable/disable offload quantized mulmat + int enable_pinned_memory; // enable/disable pinned-memory feature int precision_mode; // 0: default 1:fp16 int hvx_threads; int vtcm_size_in_mb; @@ -339,6 +362,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .print_tensors_info = 0, .dump_op_info = 0, .enable_q_mulmat = 0, + .enable_pinned_memory = 0, .precision_mode = 0, .hvx_threads = 4, .vtcm_size_in_mb = 8, @@ -1775,6 +1799,7 @@ static void ggmlhexagon_load_cfg() { hexagoncfg_instance.get_intvalue("general", "enable_profiler", g_hexagon_appcfg.enable_profiler, 0); hexagoncfg_instance.get_intvalue("general", "profiler_duration", g_hexagon_appcfg.profiler_duration, 5); hexagoncfg_instance.get_intvalue("general", "profiler_counts", g_hexagon_appcfg.profiler_counts, 100); + hexagoncfg_instance.get_intvalue("general", "enable_pinned_memory", g_hexagon_appcfg.enable_pinned_memory, 0); hexagoncfg_instance.get_intvalue("qnn", "hvx_threads", g_hexagon_appcfg.hvx_threads, 4); hexagoncfg_instance.get_intvalue("qnn", "vtcm_size_in_mb", g_hexagon_appcfg.vtcm_size_in_mb, 8); @@ -1860,6 +1885,7 @@ static void ggmlhexagon_print_running_timestamp(ggml_backend_hexagon_context * c ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); GGMLHEXAGON_LOG_INFO("hexagon_backend: %d(%s)", g_hexagon_appcfg.hexagon_backend, ggml_backend_hexagon_get_devname(g_hexagon_appcfg.hexagon_backend)); + GGMLHEXAGON_LOG_INFO("enable pinned_memory: %s", g_hexagon_appcfg.enable_pinned_memory ? "YES" : "NO"); ggmlhexagon_get_timestring(timestamp); if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { GGMLHEXAGON_LOG_INFO("offload quantize GGML_OP_MUL_MAT: %s", g_hexagon_appcfg.enable_q_mulmat ? 
"YES" : "NO"); @@ -6139,18 +6165,33 @@ static const char * ggml_backend_hexagon_host_buffer_name(ggml_backend_buffer_t } static void ggml_backend_hexagon_host_buffer_free(ggml_backend_buffer_t buffer) { - ggml_aligned_free(buffer->context, 0); + if (0 == g_hexagon_appcfg.enable_pinned_memory) { + ggml_aligned_free(buffer->context, 0); + } else { + rpcmem_free(buffer->context); + } } static void * ggml_hexagon_host_malloc(ggml_backend_buffer_type_t buft, size_t size) { - return ggml_aligned_malloc(size); + if (0 == g_hexagon_appcfg.enable_pinned_memory) { + return ggml_aligned_malloc(size); + } else { + //TODO: there are no corresponding APIs in existing Hexagon SDK, here try to re-use camera ion heap as a pinned memory + return rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, ION_CAMERA_HEAP_ID | RPCMEM_TRY_MAP_STATIC, size); + } } static ggml_backend_buffer_t ggml_backend_hexagon_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { void * host_ptr = ggml_hexagon_host_malloc(buft, size); if (nullptr == host_ptr) { + GGMLHEXAGON_LOG_INFO("failed to alloc host buffer"); + //TODO: use assertion here before find a better approach to release "correct" host buffer + // in function ggml_backend_hexagon_host_buffer_free + GGML_ASSERT(nullptr != host_ptr); return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } else { + GGMLHEXAGON_LOG_INFO("succeed to alloc host buffer %d MiB", size / SIZE_IN_MB); } ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(host_ptr, size); @@ -6356,9 +6397,12 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { } else { ggml_backend_hexagon_device_interface.supports_op = ggmlhexagon_can_handle_op_through_qnn; } + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { - //don't use system memory in this scenario - ggml_backend_hexagon_device_interface.get_host_buffer_type = nullptr; + if (0 == g_hexagon_appcfg.enable_pinned_memory) { + //don't use system memory in this scenario + ggml_backend_hexagon_device_interface.get_host_buffer_type = nullptr; + } } GGMLHEXAGON_LOG_DEBUG("create backend device for device %d", i); diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 6e5a37e03c520..cf5d1f49b2ece 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -80,6 +80,9 @@ profiler_counts = 200 # ensure enable_perf = 1 when set enable_profiler = 1; +#enable/disable pinned-memory feature +enable_pinned_memory = 0 + #hwaccel approach through QNN(offload ggml op to QNN-NPU) [qnn] # enable/disable QNN SDK's internal log, this will very helpful for troubleshooting in HWACCEL_QNN approach From 88acf6dff5c0b5a26f370533a7754182ce091b04 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 11 Apr 2025 16:14:59 +0800 Subject: [PATCH 182/200] ggml-hexagon: refine build system in ggml-hexagon --- CMakeLists.txt | 16 ++++++++-------- ggml/src/ggml-hexagon/CMakeLists.txt | 16 +++++++++++++--- 2 files changed, 21 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c5903c112b944..bb6f4b785332e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,14 +7,14 @@ set(CMAKE_WARN_UNUSED_CLI YES) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -if(CMAKE_SYSTEM_NAME STREQUAL "Android") - set(TARGET_SNAPDRAGON8GEN3 ON) - if(TARGET_SNAPDRAGON8GEN3) - #works fine on Snapdragon 8Gen3 with 1.5x(45+ tokens/second)-3x(70+ tokens/second) performance gain through the default ggml backend - add_definitions(-march=armv8.7-a) - add_definitions(-mcpu=cortex-x1) - 
add_definitions(-mtune=cortex-x1) - endif() +if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") + #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -fvectorize -ffp-model=fast -fno-finite-math-only") + message("OPT_FLAG:${OPT_FLAG}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") endif() if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 80186509f76b4..b1f6c7703571d 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -25,7 +25,6 @@ else() message("Release mode:${DEBUG_FLAG}") endif() - #v68 --- Snapdragon 888 #v69 --- Snapdragon 8 Gen1 #v73 --- Snapdragon 8 Gen2 @@ -46,6 +45,14 @@ if (${CHECK_HTP_ARCH} STREQUAL "WRONG") message(FATAL_ERROR "ggml-hexagon backend only support htp arch v68,v69,v73,v75,v79") endif() +#check optimization flags +set(OPT_FLAG " ") +if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") + #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -fvectorize -fno-finite-math-only -ffp-model=fast ") +endif() +message("OPT_FLAG:${OPT_FLAG}") + #cross compiling for hexagon kernels on cDSP side set(HEXAGON_CC "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang") set(HEXAGON_CXX "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang") @@ -84,8 +91,11 @@ else() message(FATAL_ERROR "ggml-hexagon now only available on Android and Windows(Windows on ARM)") endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG}") -set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3") + +set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/ggmlop_ap_skel.c") ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES}) From 67c7d0642fac44646b92dbe364f4bd1b051974ed Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 11 Apr 2025 17:53:03 +0800 Subject: [PATCH 183/200] ggml-hexagon: remove redundant code in struct ggml_backend_hexagon_buffer_context --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 28 +++++++++++--------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 8b8286bc6f58b..46a2790ff28a4 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -901,7 +901,6 @@ class hexagon_profiler { _profiler_threshold_duration = profiler_threshold_duration; _profiler_threshold_counts = profiler_threshold_counts; - //FIXME:hardcode 
filename of profiler data std::string filename = std::string(g_hexagon_appcfg.runtime_libpath) + "/"; if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { if (g_hexagon_appcfg.thread_counts > 1) { @@ -5763,19 +5762,12 @@ struct ggml_backend_hexagon_buffer_context { ggml_aligned_free(buffer, 0); } } - - for (auto * sub_buffer : sub_buffers) { - free(sub_buffer); - } - - sub_buffers.clear(); } + void * buffer = nullptr; + size_t buffer_size = 0; struct ggml_backend_hexagon_context * backend_ctx = nullptr; - - size_t buffer_size = 0; - std::vector sub_buffers; }; static void ggml_backend_hexagon_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -5895,9 +5887,17 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, buffer_ctx, size); } +/** + * @param buft pointer to the buffer type context + * @return alignment requirement in bytes + */ static size_t ggml_backend_hexagon_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { GGML_UNUSED(buft); - return 32; + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + return 128; + } else { + return 32; + } } static size_t ggml_backend_hexagon_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { @@ -5919,11 +5919,7 @@ static bool ggml_backend_buft_is_hexagon(ggml_backend_buffer_type_t buft) { static bool ggml_backend_hexagon_buffer_is_host(ggml_backend_buffer_type_t buft) { struct ggml_backend_hexagon_context * ctx = static_cast(buft->context); GGML_ASSERT(nullptr != ctx); - if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { - //FIXME: return false here is make sense in this scenario although this is not key-point at the moment - // fix it after solving other urgent tasks - //return false; - } + GGML_UNUSED(ctx); return true; } From 36c3ff6bf892ca90e4bc5851b7b100d798e7b0cc Mon Sep 17 00:00:00 2001 From: zhouwg Date: Fri, 11 Apr 2025 18:12:07 +0800 Subject: [PATCH 184/200] ggml-hexagon: upgrade Android NDK to android-ndk-r28 --- CMakeLists.txt | 2 +- ggml/src/ggml-hexagon/CMakeLists.txt | 2 +- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 64 ++++++++++++++++++++------ scripts/build-run-android.sh | 36 ++++++++++----- 4 files changed, 75 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb6f4b785332e..35d955065ce53 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,7 +9,7 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend - set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -fvectorize -ffp-model=fast -fno-finite-math-only") + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only") message("OPT_FLAG:${OPT_FLAG}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index b1f6c7703571d..1dd6cb46a7aa6 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -49,7 +49,7 @@ endif() set(OPT_FLAG " ") if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") #works 
fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend - set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -fvectorize -fno-finite-math-only -ffp-model=fast ") + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -fno-finite-math-only -ffp-model=fast ") endif() message("OPT_FLAG:${OPT_FLAG}") diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 46a2790ff28a4..070450a0de7d7 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -482,7 +482,21 @@ static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICE .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, - .socinfo = {}}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = HEXAGON_CDSP, + }, { .device = 1, .name = "qnn-gpu", @@ -496,7 +510,21 @@ static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICE .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, - .socinfo = {}}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = HEXAGON_CDSP, + }, { .device = 2, .name = "qnn-npu", @@ -510,7 +538,21 @@ static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICE .backend = nullptr, .raw_interface = {}, .raw_system_interface = {}, - .socinfo = {}}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = HEXAGON_CDSP, + }, }; static domain hexagon_supported_domains[] = { @@ -3857,7 +3899,9 @@ static Qnn_Tensor_t * ggmlqnn_create_general_tensor(qnn_instance * instance, Qnn .dataFormat = QNN_TENSOR_DATA_FORMAT_FLAT_BUFFER, .dataType = qnn_data_type, .quantizeParams = {.encodingDefinition = QNN_DEFINITION_UNDEFINED, - .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED}, + .quantizationEncoding = QNN_QUANTIZATION_ENCODING_UNDEFINED, + .scaleOffsetEncoding = {.scale = 0.0000000000000000f, .offset = 0} + }, .rank = rank, .dimensions = tensor_dims, .memType = QNN_TENSORMEMTYPE_RAW, @@ -4559,12 +4603,6 @@ static void ggmlqnn_compute_rms_norm(ggml_backend_hexagon_context * ctx, ggml_te GGML_UNUSED(dst); } -static void ggmlqnn_compute_diag_mask(ggml_backend_hexagon_context * ctx, ggml_tensor * dst, float value) { - GGML_UNUSED(ctx); - GGML_UNUSED(dst); - GGML_UNUSED(value); -} - static void ggmlqnn_compute_im2col(ggml_backend_hexagon_context * ctx, ggml_tensor * dst) { GGML_UNUSED(ctx); GGML_UNUSED(dst); @@ -5203,9 +5241,8 @@ static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) { hexagon_error = ggmlop_dsp_close(ctx->ggmlop_handle); if (AEE_SUCCESS != hexagon_error) { GGMLHEXAGON_LOG_WARN("error 0x%x: failed to close ggmlop dsp handle", hexagon_error); - } else { - 
ctx->ggmlop_handle = 0; } + ctx->ggmlop_handle = 0; } ggmlhexagon_deinit_rpcmempool(ctx); @@ -5722,9 +5759,6 @@ static bool ggmlhexagon_compute_forward(ggml_backend_t backend, struct ggml_tens case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: break; - case GGML_OP_DIAG_MASK_INF: - ggmlqnn_compute_diag_mask(ctx, dst, -INFINITY); - break; case GGML_OP_SOFT_MAX: ggmlqnn_compute_softmax(ctx, dst); break; diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index b686c4abf321f..6a1c35cbdcc6d 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -1,25 +1,37 @@ #!/bin/bash -# build llama.cpp + ggml-hexagon for Snapdragon mobile SoC equipped Android phone on Linux - +# build llama.cpp + ggml-hexagon for Qualcomm Snapdragon mobile SoC equipped Android phone on Linux +# +# this script will download Android NDK and Qualcomm QNN SDK automatically, +# Hexagon SDK must be obtained with a Qualcomm Developer Account and cannot be downloaded automatically in this script. +# set -e PWD=`pwd` -ANDROID_PLATFORM=android-34 -ANDROID_NDK=${PWD}/android-ndk-r26c + +#running path on Android phone REMOTE_PATH=/data/local/tmp/ +#LLM model file on Android phone GGUF_MODEL_NAME=/sdcard/gemma-3-4b-it-Q8_0.gguf GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf -#QNN SDK could be found at: +#Android NDK can be found at: +#https://developer.android.com/ndk/downloads +ANDROID_PLATFORM=android-34 +ANDROID_NDK_VERSION=r28 +ANDROID_NDK_NAME=android-ndk-${ANDROID_NDK_VERSION} +ANDROID_NDK_FULLNAME=${ANDROID_NDK_NAME}-linux.zip +ANDROID_NDK=${PWD}/${ANDROID_NDK_NAME} + +#QNN SDK can be found at: #https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk -#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/ QNN_SDK_VERSION=2.32.0.250228 QNN_SDK_VERSION=2.33.0.250327 QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION} -#5.5.3.0 should be also ok +#Hexagon SDK can be found at: +#https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 #available htp arch version: #v68 --- Snapdragon 888 @@ -27,13 +39,13 @@ HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 #v73 --- Snapdragon 8 Gen2 #v75 --- Snapdragon 8 Gen3 #v79 --- Snapdragon 8 Elite(aka Gen4) +#8Gen3 HTP_ARCH_VERSION=v75 HTP_ARCH_VERSION_a=V75 - +#8Elite HTP_ARCH_VERSION=v79 HTP_ARCH_VERSION_a=V79 -#running_params=" -mg 2 -ngl 99 " #running_params=" -mg 2 -ngl 99 -t 8 -fa 1 " running_params=" -mg 2 -ngl 99 -t 8 " @@ -109,11 +121,11 @@ function check_and_download_ndk() if [ ${is_android_ndk_exist} -eq 0 ]; then - if [ ! -f android-ndk-r26c-linux.zip ]; then - wget --no-config --quiet --show-progress -O android-ndk-r26c-linux.zip https://dl.google.com/android/repository/android-ndk-r26c-linux.zip + if [ ! -f ${ANDROID_NDK_FULLNAME} ]; then + wget --no-config --quiet --show-progress -O ${ANDROID_NDK_FULLNAME} https://dl.google.com/android/repository/${ANDROID_NDK_FULLNAME} fi - unzip android-ndk-r26c-linux.zip + unzip ${ANDROID_NDK_FULLNAME} if [ $? 
-ne 0 ]; then printf "failed to download android ndk to %s \n" "${ANDROID_NDK}" From 57cfbbe8c0d9976f00a8cf2b6b7a32e917d08fc8 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 12 Apr 2025 00:13:21 +0800 Subject: [PATCH 185/200] ggml-dsp: split ggml-dsp.c into multiple files and cleanup --- ggml/src/ggml-hexagon/CMakeLists.txt | 22 +- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 6 +- ggml/src/ggml-hexagon/kernels/Makefile | 39 ++ ggml/src/ggml-hexagon/kernels/add.c | 144 +++++ ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 582 +++--------------- ggml/src/ggml-hexagon/kernels/ggml-dsp.h | 55 +- ggml/src/ggml-hexagon/kernels/mulmat.c | 278 +++++++++ .../kernels/{ggmlop_cdsp_skel.c => skel.c} | 2 +- .../kernels/{ggmlop_ap_skel.h => skel.h} | 6 +- .../{kernels/ggmlop_ap_skel.c => stub.c} | 2 +- scripts/build-run-android.sh | 2 +- scripts/ggml-hexagon.cfg | 4 +- 12 files changed, 602 insertions(+), 540 deletions(-) create mode 100755 ggml/src/ggml-hexagon/kernels/Makefile create mode 100644 ggml/src/ggml-hexagon/kernels/add.c create mode 100644 ggml/src/ggml-hexagon/kernels/mulmat.c rename ggml/src/ggml-hexagon/kernels/{ggmlop_cdsp_skel.c => skel.c} (99%) rename ggml/src/ggml-hexagon/kernels/{ggmlop_ap_skel.h => skel.h} (99%) rename ggml/src/ggml-hexagon/{kernels/ggmlop_ap_skel.c => stub.c} (99%) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 1dd6cb46a7aa6..6d5392036d203 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -49,17 +49,10 @@ endif() set(OPT_FLAG " ") if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend - set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -fno-finite-math-only -ffp-model=fast ") + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only") endif() message("OPT_FLAG:${OPT_FLAG}") -#cross compiling for hexagon kernels on cDSP side -set(HEXAGON_CC "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang") -set(HEXAGON_CXX "${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang") -set(HEXAGON_TARGET libggmlop_skel${HTP_ARCH_VERSION}.so) -set(HEXAGON_KERNELS_PATH "${CMAKE_CURRENT_LIST_DIR}/kernels") -set(HEXAGON_COMPUTE "compute${HTP_ARCH_VERSION}") - if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) @@ -91,13 +84,12 @@ else() message(FATAL_ERROR "ggml-hexagon now only available on Android and Windows(Windows on ARM)") endif() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") -file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/ggmlop_ap_skel.c") +file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/stub.c") ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES}) target_include_directories(ggml-hexagon PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} ${CMAKE_CURRENT_LIST_DIR}) @@ -106,6 +98,7 @@ target_link_libraries(ggml-hexagon PRIVATE ${QNN_LINK_LIBRARIES}) string(REGEX REPLACE "/$" 
"" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") target_compile_definitions(ggml-hexagon PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") +#cross compiling hexagon kernels which running on cDSP side function(ggml_hexagon_build_kernel KNAME) message(STATUS "ggml_hexagon: build hexagon-kernel ${KNAME}") @@ -113,11 +106,10 @@ function(ggml_hexagon_build_kernel KNAME) TARGET ${PROJECT_NAME} POST_BUILD COMMAND echo "current working path:`pwd`\n" - COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -c ${HEXAGON_KERNELS_PATH}/ggml-dsp.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic ${DEBUG_FLAG} -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/posix -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/qurt/ - COMMAND ${HEXAGON_CC} -o ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o -c ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.c -m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B -fno-finite-math-only -I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/utils/examples -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc - COMMAND ${HEXAGON_CC} -m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${HEXAGON_TARGET} -o ../../../bin/${HEXAGON_TARGET} -Wl,--start-group ${HEXAGON_KERNELS_PATH}/ggmlop_cdsp_skel.o ${HEXAGON_KERNELS_PATH}/ggml-dsp.o -Wl,--end-group - COMMAND ls -l ../../../bin/${HEXAGON_TARGET} - COMMAND /bin/cp -fv ../../../bin/${HEXAGON_TARGET} ../../../bin/libggmlop_skel.so + COMMAND echo "${CMAKE_CURRENT_LIST_DIR}/kernels" + COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ clean + COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ HEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} HTP_ARCH_VERSION=${HTP_ARCH_VERSION} DEBUG_FLAG=${DEBUG_FLAG} + COMMAND ls -l ../../../bin/libggmlop_skel.so COMMENT "build hexagon-kernel" ) endfunction() diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 070450a0de7d7..48b019f918a08 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -121,7 +121,7 @@ #include "ggml-impl.h" #include "ggml-backend-impl.h" -#include "kernels/ggmlop_ap_skel.h" +#include "kernels/skel.h" // ================================================================================================= // section-1: forward/prototype declaration, global vars, macros, data structures @@ -383,8 +383,8 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { #elif defined(_WIN32) .qnn_runtimelib_path = "C:\\", #endif - .ggml_hexagon_version = {"1.04"}, - .ggml_dsp_version = {"0.61"}, + 
.ggml_hexagon_version = {"1.05"}, + .ggml_dsp_version = {"0.62"}, }; //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile new file mode 100755 index 0000000000000..559a8b8aec0fb --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/Makefile @@ -0,0 +1,39 @@ +#following vars already defined in CMakeLists.txt +#HTP_ARCH_VERSION=v79 +#DEBUG_FLAG=-DNDEBUG -Wall +#HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 + +HEXAGON_COMPUTE=compute${HTP_ARCH_VERSION} +HEXAGON_CC=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang +HEXAGON_CXX=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang + +TARGET=libggmlop_skel.so + +$(info HEXAGON_SDK_PATH:${HEXAGON_SDK_PATH}) +$(info HTP_ARCH_VERSION:${HTP_ARCH_VERSION}) +$(info DEBUG_FLAG:${DEBUG_FLAG}) +$(info HEXAGON_COMPUTE:${HEXAGON_COMPUTE}) + +INCS=-I${HEXAGON_SDK_PATH}/incs -I${HEXAGON_SDK_PATH}/libs/qprintf/inc -I${HEXAGON_SDK_PATH}/incs/stddef -I${HEXAGON_SDK_PATH}/ipc/fastrpc/incs -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rpcmem/inc -I${HEXAGON_SDK_PATH}/ipc/fastrpc/rtld/ship/inc -I${HEXAGON_SDK_PATH}/libs/atomic/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -I${HEXAGON_SDK_PATH}/utils/sim_utils/inc -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/posix -I${HEXAGON_SDK_PATH}/rtos/qurt/${HEXAGON_COMPUTE}/include/qurt/ + +CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initialized-in-bss -fdata-sections -fpic -D__V_DYNAMIC__ -mhvx -mhvx-length=128B ${INCS} -fno-finite-math-only + +LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET} + +SRCS = $(wildcard *.c) +OBJS = $(patsubst %.c, %.o, $(SRCS)) + +ALL:$(OBJS) + ${HEXAGON_CC} ${LDFLAGS} -o ${TARGET} -Wl,--start-group ${OBJS} -Wl,--end-group + @ls -l ${TARGET} + /bin/cp -fv ${TARGET} ../../../../out/android/bin/ + /bin/cp -fv ${TARGET} ../../../../out/android/bin/libggmlop_skel${HTP_ARCH_VERSION}.so + /bin/rm -f *.so + +%.o:%.c + @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" + ${HEXAGON_CC} ${CFLAGS} -D__FILENAME__=\"$<\" -o $@ -c $< + @echo "\n" + +clean: + rm -f *.o diff --git a/ggml/src/ggml-hexagon/kernels/add.c b/ggml/src/ggml-hexagon/kernels/add.c new file mode 100644 index 0000000000000..36d2e7bd69c57 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/add.c @@ -0,0 +1,144 @@ +#include "ggml-dsp.h" + +inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) { + HVX_Vector * va; + HVX_Vector * vb; + HVX_Vector * vc; + HVX_Vector qf32; + const int FLOATS_PER_VECTOR = 128 / sizeof(float); + const int block = n / FLOATS_PER_VECTOR; + const int left = n % FLOATS_PER_VECTOR; + const int blocks = block * FLOATS_PER_VECTOR; + + if (0 == block) { + for (size_t i = 0; i < n; ++i) + z[i] = x[i] + y[i]; + + return; + } + + if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) { + GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y); + for (size_t i = 0; i < n; ++i) + z[i] = x[i] + y[i]; + + return; + } + + va = (HVX_Vector *)x; + vb = (HVX_Vector *)y; + vc = (HVX_Vector *)z; + for (size_t i = 0; i < block; ++i) { + qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); + *vc = 
Q6_Vsf_equals_Vqf32(qf32); + vc++; + } + + if (left > 0) { + for (size_t i = 0; i < left; ++i) + z[i + blocks] = x[i + blocks] + y[i + blocks]; + } +} + +static void ggml_compute_forward_add_f32( + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + uint64_t start_time = ggml_time_us(); + + memcpy(dst->ne, src1->ne, 16); + memcpy(dst->nb, src1->nb, 16); + ggmlhexagon_dump_tensor(src0, 1); + ggmlhexagon_dump_tensor(src1, 1); + ggmlhexagon_dump_tensor(dst, 1); + + GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + + const int ith = 0; + const int nth = 1; + + const int nr = ggml_nrows(src0); + GGML_TENSOR_BINARY_OP_LOCALS + + GGML_ASSERT( nb0 == sizeof(float)); + GGML_ASSERT(nb00 == sizeof(float)); + + const int dr = (nr + nth - 1)/nth; + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + if (nb10 == sizeof(float)) { + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; + const int32_t nr0 = ne00 / ne10; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); + for (int32_t r = 0; r < nr0; ++r) { + ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); + } + } + } else { + // src1 is not contiguous + for (int ir = ir0; ir < ir1; ++ir) { + // src1 is broadcastable across src0 and dst in i1, i2, i3 + const int32_t i03 = ir/(ne02*ne01); + const int32_t i02 = (ir - i03*ne02*ne01)/ne01; + const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); + + const int32_t i13 = i03 % ne13; + const int32_t i12 = i02 % ne12; + const int32_t i11 = i01 % ne11; + + float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); + float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); + + for (int32_t i0 = 0; i0 < ne0; ++i0) { + const int32_t i10 = i0 % ne10; + float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); + + dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; + } + } + } + + uint64_t end_time = ggml_time_us(); + uint64_t duration = (end_time - start_time); + GGMLHEXAGON_LOG_DEBUG("duration %llu us", duration); +#if !GGMLHEXAGON_DEBUG + UNUSED(duration); +#endif + + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); +} + +//FIXME: why failed with test-backend-ops when disable ion rpc mempool +int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) +{ + GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); + switch (src0->type) { + case GGML_TYPE_F32: + { + if (src1->type == GGML_TYPE_F32) { + ggml_compute_forward_add_f32(src0, src1, dst); + } else { + GGML_ABORT("fatal error"); + } + break; + } + default: + { + GGML_ABORT("fatal error"); + } + } + GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); + return 0; +} diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index e79341ed27569..d58e8ec8c5a74 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -1,53 +1,9 
@@ -/* -* Copyright (c) 2023-2025 The ggml authors -* -* Permission is hereby granted, free of charge, to any person obtaining a copy -* of this software and associated documentation files (the "Software"), to -* deal in the Software without restriction, including without limitation the -* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -* sell copies of the Software, and to permit persons to whom the Software is -* furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice shall be included in -* all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -*/ -#include -#include -#include -#include -#include -#include - -#include "HAP_perf.h" -#include "HAP_farf.h" -#include "HAP_power.h" -#include "HAP_vtcm_mgr.h" -#include "HAP_compute_res.h" - -#include "qurt.h" - -#include "AEEStdErr.h" -#include "hexagon_types.h" -#include "hexagon_protos.h" - -#include "ggmlop_ap_skel.h" #include "ggml-dsp.h" // ================================================================================================= -// section-1: forward/prototype declaration,global vars,macros,data structures +// section-1: tiny ggml-dsp, ported from original ggml // ================================================================================================= -#define ggml_tensor dsptensor -static size_t ggml_nbytes(const struct ggml_tensor * tensor); -static void ggmlhexagon_log_internal(int level, const char * file, const char * func, int line, const char * format, ...); static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); @@ -57,9 +13,11 @@ static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, static float ggml_table_f32_f16[1 << 16]; -static struct ggml_compute_params params; +static struct ggml_compute_params g_params; -static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { +static int32 g_thread_counts = 1; + +struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { [GGML_TYPE_F32] = { .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, .vec_dot_type = GGML_TYPE_F32, @@ -297,10 +255,7 @@ static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { }; -// ================================================================================================= -// section-2: ggml-hexagon kernel's internal troubleshooting function -// ================================================================================================= -static void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) { +void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) 
{ #if !GGMLHEXAGON_DEBUG return; #endif @@ -317,7 +272,7 @@ static void ggmlhexagon_log_internal(int level, const char *file, const char *fu va_end(args); } -static void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { +void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { //return; float value = 0; char tmpbuf[GGMLHEXAGON_LOGBUF_LEN]; @@ -342,7 +297,7 @@ static void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { GGMLHEXAGON_LOG_DEBUG("\n"); } -static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data) { +void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data) { GGMLHEXAGON_LOG_DEBUG("ne = %5d x %5d x %5d x %5d , nb = (%5zi, %5zi, %5zi, %5zi)\n", tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->ne[3], tensor->nb[0], tensor->nb[1], tensor->nb[2], tensor->nb[3]); @@ -352,14 +307,11 @@ static void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_ } } -// ================================================================================================= -// section-3: tiny ggml-dsp: a customized ggml on Hexagon cDSP, ported from original ggml -// ================================================================================================= static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { return &type_traits_cpu[type]; } -static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, +void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, size_t bx, const float *GGML_RESTRICT y, size_t by, int nrc) { assert(nrc == 1); UNUSED(nrc); @@ -373,36 +325,36 @@ static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const fl *s = sumf; } -inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { +inline void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } -inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { +inline void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -inline static void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { +inline void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; } -static const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { +const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { return &type_traits[type]; } -static int64_t ggml_blck_size(enum ggml_type type) { +int64_t ggml_blck_size(enum ggml_type type) { return type_traits[type].blck_size; } -static size_t ggml_type_size(enum ggml_type type) { +size_t ggml_type_size(enum ggml_type type) { return type_traits[type].type_size; } -static size_t ggml_row_size(enum ggml_type type, int64_t ne) { +size_t ggml_row_size(enum ggml_type type, int64_t ne) { assert(ne % ggml_blck_size(type) == 0); return ggml_type_size(type)*ne/ggml_blck_size(type); } -static size_t ggml_nbytes(const struct ggml_tensor * tensor) { +size_t ggml_nbytes(const struct ggml_tensor * tensor) { size_t nbytes; const size_t blck_size = ggml_blck_size(tensor->type); if (blck_size == 1) { @@ -421,23 +373,23 @@ static size_t ggml_nbytes(const struct ggml_tensor * tensor) { return nbytes; } -static size_t ggml_nbytes_pad(const 
struct ggml_tensor * tensor) { +size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); } -static double ggml_type_sizef(enum ggml_type type) { +double ggml_type_sizef(enum ggml_type type) { return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; } -static const char * ggml_type_name(enum ggml_type type) { +const char * ggml_type_name(enum ggml_type type) { return type < GGML_TYPE_COUNT ? type_traits[type].type_name : "NONE"; } -static bool ggml_is_quantized(enum ggml_type type) { +bool ggml_is_quantized(enum ggml_type type) { return type_traits[type].is_quantized; } -static bool ggml_is_empty(const struct ggml_tensor * tensor) { +bool ggml_is_empty(const struct ggml_tensor * tensor) { for (int i = 0; i < GGML_MAX_DIMS; ++i) { if (tensor->ne[i] == 0) { return true; @@ -446,7 +398,7 @@ static bool ggml_is_empty(const struct ggml_tensor * tensor) { return false; } -static bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { +bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return ggml_is_empty(t0) ? ggml_is_empty(t1) : @@ -456,7 +408,7 @@ static bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_ten (t1->ne[3]%t0->ne[3] == 0); } -static bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return (t0->ne[0] == t1->ne[0]) && @@ -465,13 +417,13 @@ static bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml (t0->ne[3] == t1->ne[3]); } -static int64_t ggml_nrows(const struct ggml_tensor * tensor) { +int64_t ggml_nrows(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; } -static bool ggml_is_transposed(const struct ggml_tensor * tensor) { +bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } @@ -497,7 +449,7 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { return true; } -static int64_t ggml_nelements(const struct ggml_tensor * tensor) { + int64_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; @@ -507,7 +459,7 @@ static bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 0); } -static bool ggml_is_contiguous(const struct ggml_tensor * tensor) { + bool ggml_is_contiguous(const struct ggml_tensor * tensor) { return ggml_is_contiguous_0(tensor); } @@ -515,10 +467,9 @@ inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, co for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; } -static void ggml_abort(const char * file, int line, const char * fmt, ...) { +void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ GGMLHEXAGON_LOG_DEBUG("enter ggml_abort"); abort(); - return; } // FP16 <-> FP32 @@ -563,7 +514,7 @@ static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { return fp32_from_bits(result); } -static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { + inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) const float scale_to_inf = 0x1.0p+112f; const float scale_to_zero = 0x1.0p-110f; @@ -589,7 +540,7 @@ static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); } -static inline float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + inline float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { uint16_t s; memcpy(&s, &f, sizeof(uint16_t)); return ggml_table_f32_f16[s]; @@ -605,18 +556,18 @@ static inline void ggml_init(void) { } //FIXME:HVX multithreading should be utilized in hexagon-kernels - params.ith = 0; - params.nth = 1; + g_params.ith = 0; + g_params.nth = 1; //FIXME:hardcode buffer size - params.wsize = 512 * 1024 * 1024; - params.wdata = (char*)malloc(params.wsize); - GGML_ASSERT(NULL != params.wdata); + g_params.wsize = 512 * 1024 * 1024; + g_params.wdata = (char*)malloc(g_params.wsize); + GGML_ASSERT(NULL != g_params.wdata); } static inline void ggml_deinit(void) { - free(params.wdata); - params.wdata = NULL; - params.wsize = 0; + free(g_params.wdata); + g_params.wdata = NULL; + g_params.wsize = 0; } static inline int nearest_int(float fval) { @@ -864,24 +815,42 @@ static inline uint64 hexagon_perf_get_time_us(void) { return (uint64)(count) * 10ull / 192ull; } -static void ggml_time_init(void) { +void ggml_time_init(void) { } -static int64_t ggml_time_ms(void) { +int64_t ggml_time_ms(void) { return hexagon_perf_get_time_us() * 1000; } -static int64_t ggml_time_us(void) { +int64_t ggml_time_us(void) { return hexagon_perf_get_time_us(); } // ================================================================================================= -// section-4: ggml-hexagon kernel helper function +// ggml-hexagon kernel helper function +// ================================================================================================= +int ggmlop_get_thread_counts(void) { + return g_thread_counts; +} + +struct ggml_compute_params * ggmlop_get_params(void) { + return &g_params; +} + +int ggml_get_params_size(void) { + return g_params.wsize; +} + +char * ggml_get_params_data(void) { + return g_params.wdata; +} + +// ================================================================================================= +// implementation of ggml-hexagon kernel skel function // ================================================================================================= -static int32 g_thread_counts = 1; int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { void *tptr = NULL; - FARF(HIGH, "uri %s", uri); + GGMLHEXAGON_LOG_DEBUG("uri %s", uri); tptr = (void *)malloc(1); *handle = (remote_handle64)tptr; assert(*handle); @@ -962,446 +931,33 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 return AEE_SUCCESS; } + // ================================================================================================= -// section-5: ggml-hexagon kernel functions: offload ggmlop to cDSP through Hexagon C API and SIMD instructions +// implementation of ggml-hexagon kernel, it's better to put every kernel to a 
single file // ================================================================================================= -inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) { - HVX_Vector * va; - HVX_Vector * vb; - HVX_Vector * vc; - HVX_Vector qf32; - const int FLOATS_PER_VECTOR = 128 / sizeof(float); - const int block = n / FLOATS_PER_VECTOR; - const int left = n % FLOATS_PER_VECTOR; - const int blocks = block * FLOATS_PER_VECTOR; - - if (0 == block) { - for (size_t i = 0; i < n; ++i) - z[i] = x[i] + y[i]; - - return; - } - - if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) { - GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y); - for (size_t i = 0; i < n; ++i) - z[i] = x[i] + y[i]; - - return; - } - - va = (HVX_Vector *)x; - vb = (HVX_Vector *)y; - vc = (HVX_Vector *)z; - for (size_t i = 0; i < block; ++i) { - qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); - *vc = Q6_Vsf_equals_Vqf32(qf32); - vc++; - } - - if (left > 0) { - for (size_t i = 0; i < left; ++i) - z[i + blocks] = x[i + blocks] + y[i + blocks]; - } -} - -static void ggml_compute_forward_add_f32( - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - struct ggml_tensor * dst) { +int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - uint64_t start_time = ggml_time_us(); - - memcpy(dst->ne, src1->ne, 16); - memcpy(dst->nb, src1->nb, 16); - ggmlhexagon_dump_tensor(src0, 1); - ggmlhexagon_dump_tensor(src1, 1); - ggmlhexagon_dump_tensor(dst, 1); - - GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); - - const int ith = 0; - const int nth = 1; - - const int nr = ggml_nrows(src0); - GGML_TENSOR_BINARY_OP_LOCALS - - GGML_ASSERT( nb0 == sizeof(float)); - GGML_ASSERT(nb00 == sizeof(float)); - - const int dr = (nr + nth - 1)/nth; - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - if (nb10 == sizeof(float)) { - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int32_t i03 = ir/(ne02*ne01); - const int32_t i02 = (ir - i03*ne02*ne01)/ne01; - const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int32_t i13 = i03 % ne13; - const int32_t i12 = i02 % ne12; - const int32_t i11 = i01 % ne11; - const int32_t nr0 = ne00 / ne10; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11); - for (int32_t r = 0; r < nr0; ++r) { - ggmlhexagon_dsp_add_f32(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr); - } - } - } else { - // src1 is not contiguous - for (int ir = ir0; ir < ir1; ++ir) { - // src1 is broadcastable across src0 and dst in i1, i2, i3 - const int32_t i03 = ir/(ne02*ne01); - const int32_t i02 = (ir - i03*ne02*ne01)/ne01; - const int32_t i01 = (ir - i03*ne02*ne01 - i02*ne01); - - const int32_t i13 = i03 % ne13; - const int32_t i12 = i02 % ne12; - const int32_t i11 = i01 % ne11; - - float * dst_ptr = (float *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 ); - float * src0_ptr = (float *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01); - - for (int32_t i0 = 0; i0 < ne0; ++i0) { - const int32_t i10 = i0 % ne10; - float * src1_ptr = (float *) ((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + i10*nb10); - - 
dst_ptr[i0] = src0_ptr[i0] + *src1_ptr; - } - } - } - uint64_t end_time = ggml_time_us(); - uint64_t duration = (end_time - start_time); - GGMLHEXAGON_LOG_DEBUG("duration %llu us", duration); -#if !GGMLHEXAGON_DEBUG - UNUSED(duration); -#endif GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); -} -//FIXME: failed with test-backend-ops when disable ion rpc mempool -int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) -{ - GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); - switch (src0->type) { - case GGML_TYPE_F32: - { - if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f32(src0, src1, dst); - } else { - GGML_ABORT("fatal error"); - } - break; - } - default: - { - GGML_ABORT("fatal error"); - } - } - GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); return 0; } -static void ggml_compute_forward_mul_mat_one_chunk( - const struct ggml_compute_params * params, - const ggml_tensor * src0, - const ggml_tensor * src1, - struct ggml_tensor * dst, - const enum ggml_type type, - const int32_t num_rows_per_vec_dot, - const int32_t ir0_start, - const int32_t ir0_end, - const int32_t ir1_start, - const int32_t ir1_end) { - ggmlhexagon_dump_tensor(src0, 0); - ggmlhexagon_dump_tensor(src1, 0); - ggmlhexagon_dump_tensor(dst, 0); - - dst->ne[0] = src0->ne[1]; - dst->ne[1] = src1->ne[1]; - dst->ne[2] = src1->ne[2]; - dst->ne[3] = src1->ne[3]; - - dst->nb[0] = ggml_type_size(src1->type); - dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); - dst->nb[2] = dst->nb[1] * dst->ne[1]; - dst->nb[3] = dst->nb[2] * dst->ne[2]; - ggmlhexagon_dump_tensor(dst, 0); - - GGML_TENSOR_BINARY_OP_LOCALS - - const bool src1_cont = ggml_is_contiguous(src1); - - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; - - // broadcast factors - const int32_t r2 = ne12 / ne02; - const int32_t r3 = ne13 / ne03; - - if (ir0_start >= ir0_end || ir1_start >= ir1_end) { - return; - } - - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); - - assert(ne12 % ne02 == 0); - assert(ne13 % ne03 == 0); - - // block-tiling attempt - const int32_t blck_0 = 16; - const int32_t blck_1 = 16; - - const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? 
row_size : nb11; - - // attempt to reduce false-sharing (does not seem to make a difference) - // 16 * 2, accounting for mmla kernels - float tmp[32]; - - for (int32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { - for (int32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { - for (int32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { - const int32_t i13 = (ir1 / (ne12 * ne1)); - const int32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; - const int32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); - - // broadcast src0 into src1 - const int32_t i03 = i13 / r3; - const int32_t i02 = i12 / r2; - - const int32_t i1 = i11; - const int32_t i2 = i12; - const int32_t i3 = i13; - - const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); - - // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides - // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using - // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this - const char * src1_col = (const char*)wdata + - (src1_cont || src1->type != vec_dot_type - ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size - : (i11 * nb11 + i12 * nb12 + i13 * nb13)); - float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); - - //for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { - // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); - //} - - for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot); - } - - for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { - memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); - } - } - } - } -} - -//FIXME: only support fp32 mulmat on cDSP -static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - ggmlhexagon_dump_tensor(src0, 0); - ggmlhexagon_dump_tensor(src1, 0); - ggmlhexagon_dump_tensor(dst, 0); - - dst->ne[0] = src0->ne[1]; - dst->ne[1] = src1->ne[1]; - dst->ne[2] = src1->ne[2]; - dst->ne[3] = src1->ne[3]; - - dst->nb[0] = ggml_type_size(src1->type); - dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); - dst->nb[2] = dst->nb[1] * dst->ne[1]; - dst->nb[3] = dst->nb[2] * dst->ne[2]; - ggmlhexagon_dump_tensor(dst, 0); - - GGML_TENSOR_BINARY_OP_LOCALS - - enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; - ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - int32_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; - const int ith = 0; - const int nth = 1; - - GGML_ASSERT(ne0 == ne01); - GGML_ASSERT(ne1 == ne11); - GGML_ASSERT(ne2 == ne12); - GGML_ASSERT(ne3 == ne13); - - // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(src0->type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); - - // dst cannot be transposed or permuted - GGML_ASSERT(nb0 == sizeof(float)); - GGML_ASSERT(nb0 <= nb1); - GGML_ASSERT(nb1 <= nb2); - GGML_ASSERT(nb2 <= nb3); - -#if 0 //naive algorithm for fp32, can pass various case in UT - { - //ggml_dump_tensor(src0); - //ggml_dump_tensor(src1); - - float * a = (float*)src0->data; - float * b = (float*)src1->data; - float * c = (float*)dst->data; - int M = src0->ne[1]; - int K = src0->ne[0]; - int N = src1->ne[1]; - float sum = 0; - for (int i = 0; i < M; i++) { - for (int j = 0; j < N; j++) { - sum = 0; - for (int h = 0; h < K; h++) { - sum += a[i * K + h] * b[h * N + j]; - } - c[i * N + j] = sum; - } - } - return 0; - } -#endif - - if (src1->type != vec_dot_type) { - size_t wsize = ggml_row_size(vec_dot_type, ggml_nelements(src1)); - GGML_ASSERT(wsize < params.wsize); - } - - if (src1->type != vec_dot_type) { - char * wdata = params.wdata; - - const size_t nbw0 = ggml_type_size(vec_dot_type); - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; - - assert(params.wsize >= ne13*nbw3); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - size_t bs = ggml_blck_size(vec_dot_type); - int64_t ne10_block_start = (ith * ne10/bs) / nth; - int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth; - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0), - (ne10_block_end - ne10_block_start) * bs); - } - } - } - } - - // This is the size of the first dimension of the result, so we can iterate that way. 
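The chunk worker above flattens the output's (i1, i2, i3) coordinates into a single row index over ne1*ne2*ne3 and recovers them with a div/mod sequence; a small standalone sketch with assumed sizes makes the mapping concrete:

// Sketch only: recovering (i11, i12, i13) from a flattened row index ir1,
// mirroring the div/mod sequence in ggml_compute_forward_mul_mat_one_chunk.
// Sizes are assumptions for illustration: ne1 = 4, ne12 = 3, ne13 = 2 -> ir1 in 0..23.
static void example_unflatten_row_index(void) {
    const int ne1 = 4, ne12 = 3;
    const int ir1 = 17;
    const int i13 = ir1 / (ne12 * ne1);                    // 17 / 12     = 1
    const int i12 = (ir1 - i13 * ne12 * ne1) / ne1;        // (17-12) / 4 = 1
    const int i11 = ir1 - i13 * ne12 * ne1 - i12 * ne1;    // 17 - 12 - 4 = 1
    (void)i13; (void)i12; (void)i11;                       // result: (1, 1, 1)
}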
(see the ASSERT above, these are the same numbers) - const int32_t nr0 = ne0; - - // This is the size of the rest of the dimensions of the result - const int32_t nr1 = ne1 * ne2 * ne3; - - // Now select a reasonable chunk size. - int chunk_size = 16; - - // We need to step up the size if it's small - if (nr0 == 1 || nr1 == 1) { - chunk_size = 64; - } - - // distribute the work across the inner or outer loop based on which one is larger - // The number of chunks in the 0/1 dim. - // CEIL(nr0/chunk_size) - int32_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; - int32_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; - - // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. - // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915 - // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that. - if (nchunk0 * nchunk1 < 4) { - // distribute the thread work across the inner or outer loop based on which one is larger - nchunk0 = 1; // parallelize by src0 rows - nchunk1 = 1; // parallelize by src1 rows - } - - // The number of elements in each chunk - const int32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; - const int32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; - - // The first chunk comes from our thread_id, the rest will get auto-assigned. - int current_chunk = 0; - - while (current_chunk < nchunk0 * nchunk1) { - const int32_t ith0 = current_chunk % nchunk0; - const int32_t ith1 = current_chunk / nchunk0; - - const int32_t ir0_start = dr0 * ith0; - const int32_t ir0_end = MIN(ir0_start + dr0, nr0); - const int32_t ir1_start = dr1 * ith1; - const int32_t ir1_end = MIN(ir1_start + dr1, nr1); - - // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols - int32_t num_rows_per_vec_dot = vec_dot_num_rows; - - // these checks are needed to avoid crossing dim1 boundaries - // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity - if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { - num_rows_per_vec_dot = 1; - } - ggml_compute_forward_mul_mat_one_chunk(¶ms, src0, src1, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); - - if (1 >= nchunk0 * nchunk1) { - break; - } - current_chunk++; - } - - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; -} - -int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) { - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; -} -int ggmlop_dsp_mulmat(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) { - if (g_thread_counts > 1) { - return ggmlop_dsp_mulmat_multithread(h, src0, src1, dst); - } else { - return ggmlop_dsp_mulmat_singlethread(h, src0, src1, dst); - } return 0; } -int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { - +int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; -} -int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { - GGMLHEXAGON_LOG_DEBUG("enter %s", 
__func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; -} -int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { - - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); return 0; } diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h index c77e45391205e..cc399ac3c2dd5 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -5,12 +5,28 @@ #include #include #include +#include #include +#include "HAP_perf.h" +#include "HAP_farf.h" +#include "HAP_power.h" +#include "HAP_vtcm_mgr.h" +#include "HAP_compute_res.h" + +#include "qurt.h" +#include "AEEStdErr.h" +#include "hexagon_types.h" +#include "hexagon_protos.h" + +#include "skel.h" + #ifdef __cplusplus extern "C" { #endif +#define ggml_tensor dsptensor + #define GGML_MAX_DIMS 4 #define ALIGN_128_BYTE 128 @@ -51,7 +67,7 @@ extern "C" { #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) //NPU performance will be slower when enable GGMLHEXAGON_DEBUG -#ifdef NDEBUG +#if 1//def NDEBUG #define GGMLHEXAGON_DEBUG 0 #else #define GGMLHEXAGON_DEBUG 1 @@ -323,6 +339,43 @@ struct ggml_type_traits_cpu { int64_t nrows; // number of rows to process simultaneously }; +void ggml_time_init(void); +int64_t ggml_time_ms(void); +int64_t ggml_time_us(void); + +size_t ggml_type_size(enum ggml_type type); +int64_t ggml_blck_size(enum ggml_type type); + +inline float ggml_lookup_fp16_to_fp32(ggml_fp16_t f); +inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f); + +int64_t ggml_nrows(const struct ggml_tensor * tensor); +bool ggml_is_transposed(const struct ggml_tensor * tensor); + +bool ggml_is_empty(const struct ggml_tensor * tensor); +size_t ggml_nbytes(const struct ggml_tensor * tensor); + +bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + +void ggml_abort(const char * file, int line, const char * fmt, ...); +bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); + +size_t ggml_row_size(enum ggml_type type, int64_t ne); +int64_t ggml_nelements(const struct ggml_tensor * tensor); +bool ggml_is_contiguous(const struct ggml_tensor * tensor); + +void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor); +void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data); +void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...); + +int ggmlop_get_thread_counts(void); +int ggml_get_params_size(void); +char * ggml_get_params_data(void); +struct ggml_compute_params * ggmlop_get_params(void); + +extern struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT]; + #ifdef __cplusplus } #endif diff --git a/ggml/src/ggml-hexagon/kernels/mulmat.c b/ggml/src/ggml-hexagon/kernels/mulmat.c new file mode 100644 index 0000000000000..4c4ad2ba75ea5 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/mulmat.c @@ -0,0 +1,278 @@ +#include "ggml-dsp.h" + +static void ggml_compute_forward_mul_mat_one_chunk( + const struct ggml_compute_params * params, + const ggml_tensor * src0, + const ggml_tensor * src1, + struct ggml_tensor * dst, + const enum ggml_type type, + const int32_t num_rows_per_vec_dot, + const int32_t ir0_start, + const int32_t ir0_end, + const int32_t ir1_start, + const int32_t ir1_end) { + ggmlhexagon_dump_tensor(src0, 0); + 
ggmlhexagon_dump_tensor(src1, 0); + ggmlhexagon_dump_tensor(dst, 0); + + dst->ne[0] = src0->ne[1]; + dst->ne[1] = src1->ne[1]; + dst->ne[2] = src1->ne[2]; + dst->ne[3] = src1->ne[3]; + + dst->nb[0] = ggml_type_size(src1->type); + dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[2] = dst->nb[1] * dst->ne[1]; + dst->nb[3] = dst->nb[2] * dst->ne[2]; + ggmlhexagon_dump_tensor(dst, 0); + + GGML_TENSOR_BINARY_OP_LOCALS + + const bool src1_cont = ggml_is_contiguous(src1); + + ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; + enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; + + // broadcast factors + const int32_t r2 = ne12 / ne02; + const int32_t r3 = ne13 / ne03; + + if (ir0_start >= ir0_end || ir1_start >= ir1_end) { + return; + } + + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; + const size_t row_size = ggml_row_size(vec_dot_type, ne10); + + assert(ne12 % ne02 == 0); + assert(ne13 % ne03 == 0); + + // block-tiling attempt + const int32_t blck_0 = 16; + const int32_t blck_1 = 16; + + const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? row_size : nb11; + + // attempt to reduce false-sharing (does not seem to make a difference) + // 16 * 2, accounting for mmla kernels + float tmp[32]; + + for (int32_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) { + for (int32_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) { + for (int32_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ir1 += num_rows_per_vec_dot) { + const int32_t i13 = (ir1 / (ne12 * ne1)); + const int32_t i12 = (ir1 - i13 * ne12 * ne1) / ne1; + const int32_t i11 = (ir1 - i13 * ne12 * ne1 - i12 * ne1); + + // broadcast src0 into src1 + const int32_t i03 = i13 / r3; + const int32_t i02 = i12 / r2; + + const int32_t i1 = i11; + const int32_t i2 = i12; + const int32_t i3 = i13; + + const char * src0_row = (const char*)src0->data + (0 + i02 * nb02 + i03 * nb03); + + // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides + // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using + // the original src1 data pointer, so we should index using the indices directly + // TODO: this is a bit of a hack, we should probably have a better way to handle this + const char * src1_col = (const char*)wdata + + (src1_cont || src1->type != vec_dot_type + ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size + : (i11 * nb11 + i12 * nb12 + i13 * nb13)); + float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); + + //for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { + // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); + //} + + for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { + vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot); + } + + for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { + memcpy(&dst_col[iir0 + cn * nb1 / nb0], tmp + (cn * 16), (MIN(iir0 + blck_0, ir0_end) - iir0) * sizeof(float)); + } + } + } + } +} + +//FIXME: only support fp32 mulmat on cDSP +static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + ggmlhexagon_dump_tensor(src0, 0); + ggmlhexagon_dump_tensor(src1, 0); + ggmlhexagon_dump_tensor(dst, 0); + + dst->ne[0] = src0->ne[1]; + dst->ne[1] = src1->ne[1]; + dst->ne[2] = src1->ne[2]; + dst->ne[3] = src1->ne[3]; + + dst->nb[0] = ggml_type_size(src1->type); + dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[2] = dst->nb[1] * dst->ne[1]; + dst->nb[3] = dst->nb[2] * dst->ne[2]; + ggmlhexagon_dump_tensor(dst, 0); + + GGML_TENSOR_BINARY_OP_LOCALS + + enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; + ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; + int32_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; + const int ith = 0; + const int nth = 1; + + GGML_ASSERT(ne0 == ne01); + GGML_ASSERT(ne1 == ne11); + GGML_ASSERT(ne2 == ne12); + GGML_ASSERT(ne3 == ne13); + + // we don't support permuted src0 or src1 + GGML_ASSERT(nb00 == ggml_type_size(src0->type)); + GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + + // dst cannot be transposed or permuted + GGML_ASSERT(nb0 == sizeof(float)); + GGML_ASSERT(nb0 <= nb1); + GGML_ASSERT(nb1 <= nb2); + GGML_ASSERT(nb2 <= nb3); + +#if 0 //naive algorithm for fp32, can pass various case in UT + { + //ggml_dump_tensor(src0); + //ggml_dump_tensor(src1); + + float * a = (float*)src0->data; + float * b = (float*)src1->data; + float * c = (float*)dst->data; + int M = src0->ne[1]; + int K = src0->ne[0]; + int N = src1->ne[1]; + float sum = 0; + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + sum = 0; + for (int h = 0; h < K; h++) { + sum += a[i * K + h] * b[h * N + j]; + } + c[i * N + j] = sum; + } + } + return 0; + } +#endif + + if (src1->type != vec_dot_type) { + size_t wsize = ggml_row_size(vec_dot_type, ggml_nelements(src1)); + GGML_ASSERT(wsize < ggml_get_params_size()); + } + + if (src1->type != vec_dot_type) { + char * wdata = ggml_get_params_data(); + + const size_t nbw0 = ggml_type_size(vec_dot_type); + const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); + const size_t nbw2 = nbw1*ne11; + const size_t nbw3 = nbw2*ne12; + + assert(ggml_get_params_size() >= ne13*nbw3); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + for (int64_t i13 = 0; i13 < ne13; ++i13) { + for (int64_t i12 = 0; i12 < ne12; ++i12) { + for (int64_t i11 = 0; i11 < ne11; ++i11) { + size_t bs = ggml_blck_size(vec_dot_type); + int64_t ne10_block_start = (ith * ne10/bs) / nth; + int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth; + from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), + (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0), + (ne10_block_end - ne10_block_start) * bs); + } + } + } + } + + // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) + const int32_t nr0 = ne0; + + // This is the size of the rest of the dimensions of the result + const int32_t nr1 = ne1 * ne2 * ne3; + + // Now select a reasonable chunk size. 
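+    // Chunking scheme (single-threaded on the cDSP for now): the result is split into
+    // nchunk0 x nchunk1 tiles of roughly chunk_size rows each; every pass of the
+    // while-loop below hands one tile to ggml_compute_forward_mul_mat_one_chunk,
+    // so a future multi-threaded path can assign tiles to HVX worker threads.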
+ int chunk_size = 16; + + // We need to step up the size if it's small + if (nr0 == 1 || nr1 == 1) { + chunk_size = 64; + } + + // distribute the work across the inner or outer loop based on which one is larger + // The number of chunks in the 0/1 dim. + // CEIL(nr0/chunk_size) + int32_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size; + int32_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size; + + // If the chunking is poor for the number of threads on this setup, scrap the whole plan. Re-chunk it by thread. + // Also, chunking by thread was measured to have perform better on NUMA systems. See https://github.com/ggml-org/llama.cpp/pull/6915 + // In theory, chunking should be just as useful on NUMA and non NUMA systems, but testing disagreed with that. + if (nchunk0 * nchunk1 < 4) { + // distribute the thread work across the inner or outer loop based on which one is larger + nchunk0 = 1; // parallelize by src0 rows + nchunk1 = 1; // parallelize by src1 rows + } + + // The number of elements in each chunk + const int32_t dr0 = (nr0 + nchunk0 - 1) / nchunk0; + const int32_t dr1 = (nr1 + nchunk1 - 1) / nchunk1; + + // The first chunk comes from our thread_id, the rest will get auto-assigned. + int current_chunk = 0; + + while (current_chunk < nchunk0 * nchunk1) { + const int32_t ith0 = current_chunk % nchunk0; + const int32_t ith1 = current_chunk / nchunk0; + + const int32_t ir0_start = dr0 * ith0; + const int32_t ir0_end = MIN(ir0_start + dr0, nr0); + + const int32_t ir1_start = dr1 * ith1; + const int32_t ir1_end = MIN(ir1_start + dr1, nr1); + + // dot kernels can handle 1 row and col at a time, but mmla kernels can process 2 rows and cols + int32_t num_rows_per_vec_dot = vec_dot_num_rows; + + // these checks are needed to avoid crossing dim1 boundaries + // can be optimized, but the logic would become more complicated, so keeping it like this for simplicity + if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { + num_rows_per_vec_dot = 1; + } + ggml_compute_forward_mul_mat_one_chunk(ggmlop_get_params(), src0, src1, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + + if (1 >= nchunk0 * nchunk1) { + break; + } + current_chunk++; + } + + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +static int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_dsp_mulmat(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) { + if (ggmlop_get_thread_counts() > 1) { + return ggmlop_dsp_mulmat_multithread(h, src0, src1, dst); + } else { + return ggmlop_dsp_mulmat_singlethread(h, src0, src1, dst); + } + return 0; +} diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c b/ggml/src/ggml-hexagon/kernels/skel.c similarity index 99% rename from ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c rename to ggml/src/ggml-hexagon/kernels/skel.c index 1e9d31a72319d..6ddf1ef485f91 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_cdsp_skel.c +++ b/ggml/src/ggml-hexagon/kernels/skel.c @@ -1,6 +1,6 @@ //qidl copyright //qidl nested=false -#include "ggmlop_ap_skel.h" +#include "skel.h" #include #ifndef _WIN32 diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h b/ggml/src/ggml-hexagon/kernels/skel.h similarity index 99% rename from 
ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h rename to ggml/src/ggml-hexagon/kernels/skel.h index f189c48d0238b..e2f34c7999bcc 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.h +++ b/ggml/src/ggml-hexagon/kernels/skel.h @@ -1,5 +1,5 @@ -#ifndef _GGMLOP_H -#define _GGMLOP_H +#ifndef _SKEL_H +#define _SKEL_H //qidl copyright //qidl nested=false #include @@ -284,4 +284,4 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_pool2d)(remote_handle64 _h, co #ifdef __cplusplus } #endif -#endif //_GGMLOP_H +#endif //_SKEL_H diff --git a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c b/ggml/src/ggml-hexagon/stub.c similarity index 99% rename from ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c rename to ggml/src/ggml-hexagon/stub.c index b0a660ce96a79..6074d243610df 100644 --- a/ggml/src/ggml-hexagon/kernels/ggmlop_ap_skel.c +++ b/ggml/src/ggml-hexagon/stub.c @@ -1,6 +1,6 @@ //qidl copyright //qidl nested=false -#include "ggmlop_ap_skel.h" +#include "skel.h" #include #ifndef _WIN32 #include "HAP_farf.h" diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 6a1c35cbdcc6d..2dd784058abed 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -25,7 +25,7 @@ ANDROID_NDK=${PWD}/${ANDROID_NDK_NAME} #QNN SDK can be found at: #https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk -QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt/ +QNN_SDK_INSTALL_PATH=/opt/qcom/aistack/qairt QNN_SDK_VERSION=2.32.0.250228 QNN_SDK_VERSION=2.33.0.250327 QNN_SDK_PATH=${QNN_SDK_INSTALL_PATH}/${QNN_SDK_VERSION} diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index cf5d1f49b2ece..654ae34a452a2 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -23,9 +23,9 @@ # [general] #version of ggml-hexagon.cpp on ARM-AP side -version = "1.04" +version = "1.05" #version of ggml-dsp.c on cDSP side -ggmldsp_version = "0.61" +ggmldsp_version = "0.62" #0: HEXAGON_BACKEND_QNNCPU #1: HEXAGON_BACKEND_QNNGPU From bcb50125315506ef40302078f99e30c2499bb3ed Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 12 Apr 2025 09:44:27 +0800 Subject: [PATCH 186/200] ggml-dsp: refine ggml-dsp and make ggml-dsp more clear --- ggml/src/ggml-hexagon/CMakeLists.txt | 7 ++++--- ggml/src/ggml-hexagon/kernels/Makefile | 9 +++++---- ggml/src/ggml-hexagon/kernels/ggml-dsp.h | 2 +- ggml/src/ggml-hexagon/kernels/skel.c | 2 +- ggml/src/ggml-hexagon/kernels/skel.h | 2 +- ggml/src/ggml-hexagon/{ => kernels}/stub.c | 0 6 files changed, 12 insertions(+), 10 deletions(-) rename ggml/src/ggml-hexagon/{ => kernels}/stub.c (100%) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 6d5392036d203..0a4082d8b4f3a 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -89,7 +89,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_F set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") -file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/stub.c") +file(GLOB HEXAGON_SOURCES "${CMAKE_CURRENT_LIST_DIR}/*.cpp" "${CMAKE_CURRENT_LIST_DIR}/kernels/stub.c") ggml_add_backend_library(ggml-hexagon ${HEXAGON_SOURCES}) target_include_directories(ggml-hexagon PRIVATE ${QNN_SDK_PATH}/include/QNN ${HEXAGON_SDK_PATH} 
${CMAKE_CURRENT_LIST_DIR}) @@ -98,7 +98,7 @@ target_link_libraries(ggml-hexagon PRIVATE ${QNN_LINK_LIBRARIES}) string(REGEX REPLACE "/$" "" QNN_DEFAULT_LIB_SEARCH_PATH "${QNN_DEFAULT_LIB_SEARCH_PATH}") target_compile_definitions(ggml-hexagon PRIVATE QNN_DEFAULT_LIB_SEARCH_PATH="${QNN_DEFAULT_LIB_SEARCH_PATH}/") -#cross compiling hexagon kernels which running on cDSP side +#cross compiling source codes of hexagon kernels which running on cDSP side function(ggml_hexagon_build_kernel KNAME) message(STATUS "ggml_hexagon: build hexagon-kernel ${KNAME}") @@ -109,7 +109,8 @@ function(ggml_hexagon_build_kernel KNAME) COMMAND echo "${CMAKE_CURRENT_LIST_DIR}/kernels" COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ clean COMMAND make -C ${CMAKE_CURRENT_LIST_DIR}/kernels/ HEXAGON_SDK_PATH=${HEXAGON_SDK_PATH} HTP_ARCH_VERSION=${HTP_ARCH_VERSION} DEBUG_FLAG=${DEBUG_FLAG} - COMMAND ls -l ../../../bin/libggmlop_skel.so + COMMAND echo "current working path:`pwd`\n" + COMMAND ls -l ../../../bin/libggmlop-skel.so COMMENT "build hexagon-kernel" ) endfunction() diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile index 559a8b8aec0fb..b513a3f118d28 100755 --- a/ggml/src/ggml-hexagon/kernels/Makefile +++ b/ggml/src/ggml-hexagon/kernels/Makefile @@ -7,7 +7,7 @@ HEXAGON_COMPUTE=compute${HTP_ARCH_VERSION} HEXAGON_CC=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang HEXAGON_CXX=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang -TARGET=libggmlop_skel.so +TARGET=libggmlop-skel.so $(info HEXAGON_SDK_PATH:${HEXAGON_SDK_PATH}) $(info HTP_ARCH_VERSION:${HTP_ARCH_VERSION}) @@ -20,19 +20,20 @@ CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initi LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET} -SRCS = $(wildcard *.c) +#SRCS = $(wildcard *.c) +SRCS = ggml-dsp.c skel.c add.c mulmat.c OBJS = $(patsubst %.c, %.o, $(SRCS)) ALL:$(OBJS) ${HEXAGON_CC} ${LDFLAGS} -o ${TARGET} -Wl,--start-group ${OBJS} -Wl,--end-group @ls -l ${TARGET} /bin/cp -fv ${TARGET} ../../../../out/android/bin/ - /bin/cp -fv ${TARGET} ../../../../out/android/bin/libggmlop_skel${HTP_ARCH_VERSION}.so + /bin/cp -fv ${TARGET} ../../../../out/android/bin/libggmlop-skel${HTP_ARCH_VERSION}.so /bin/rm -f *.so %.o:%.c @echo "${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $<" - ${HEXAGON_CC} ${CFLAGS} -D__FILENAME__=\"$<\" -o $@ -c $< + ${HEXAGON_CC} ${CFLAGS} ${DEBUG_FLAG} -D__FILENAME__=\"$<\" -o $@ -c $< @echo "\n" clean: diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h index cc399ac3c2dd5..1a9b32858efd3 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -67,7 +67,7 @@ extern "C" { #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) //NPU performance will be slower when enable GGMLHEXAGON_DEBUG -#if 1//def NDEBUG +#ifdef NDEBUG #define GGMLHEXAGON_DEBUG 0 #else #define GGMLHEXAGON_DEBUG 1 diff --git a/ggml/src/ggml-hexagon/kernels/skel.c b/ggml/src/ggml-hexagon/kernels/skel.c index 6ddf1ef485f91..26da58273f013 100644 --- a/ggml/src/ggml-hexagon/kernels/skel.c +++ b/ggml/src/ggml-hexagon/kernels/skel.c @@ -290,7 +290,7 @@ extern int adsp_mmap_fd_getinfo(int, uint32_t *); extern "C" { #endif _ATTRIBUTE_VISIBILITY uint32_t 
ggmlop_skel_handle_invoke_qaic_version = 10048; -_ATTRIBUTE_VISIBILITY char ggmlop_skel_handle_invoke_uri[77+1]="file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; +_ATTRIBUTE_VISIBILITY char ggmlop_skel_handle_invoke_uri[77+1]="file:///libggmlop-skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1"; static __inline int _skel_pack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { int _nErr = 0; remote_arg* _praROutPostStart = _praROutPost; diff --git a/ggml/src/ggml-hexagon/kernels/skel.h b/ggml/src/ggml-hexagon/kernels/skel.h index e2f34c7999bcc..194c71e6ecb2a 100644 --- a/ggml/src/ggml-hexagon/kernels/skel.h +++ b/ggml/src/ggml-hexagon/kernels/skel.h @@ -279,7 +279,7 @@ __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_softmax)(remote_handle64 _h, c __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_rmsnorm)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; __QAIC_HEADER_EXPORT int __QAIC_HEADER(ggmlop_dsp_pool2d)(remote_handle64 _h, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_HEADER_ATTRIBUTE; #ifndef ggmlop_URI -#define ggmlop_URI "file:///libggmlop_skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" +#define ggmlop_URI "file:///libggmlop-skel.so?ggmlop_skel_handle_invoke&_modver=1.0&_idlver=0.0.1" #endif /*ggmlop_URI*/ #ifdef __cplusplus } diff --git a/ggml/src/ggml-hexagon/stub.c b/ggml/src/ggml-hexagon/kernels/stub.c similarity index 100% rename from ggml/src/ggml-hexagon/stub.c rename to ggml/src/ggml-hexagon/kernels/stub.c From 69315107d6124c5455d26a6b0274d52af711dea1 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 12 Apr 2025 14:37:15 +0800 Subject: [PATCH 187/200] ggml-hexagon: fix a minior issue in dev ops --- ggml/src/ggml-hexagon/CMakeLists.txt | 12 ++++++++++++ ggml/src/ggml-hexagon/ggml-hexagon.cpp | 2 +- scripts/build-run-android.sh | 5 +++++ scripts/ggml-hexagon.cfg | 12 +++++++----- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 0a4082d8b4f3a..c1393adb27b49 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -115,4 +115,16 @@ function(ggml_hexagon_build_kernel KNAME) ) endfunction() +function(ggml_hexagon_setup_cfg KNAME) + message(STATUS "ggml_hexagon: setup runtime configuration file ${KNAME}") + add_custom_command( + TARGET ${PROJECT_NAME} + POST_BUILD + COMMAND echo "current working path:`pwd`\n" + COMMAND /bin/cp -fv ../../../../../scripts/${KNAME} ../../../bin/ + COMMENT "setup runtime configuration file" + ) +endfunction() + ggml_hexagon_build_kernel("cdsp") +ggml_hexagon_setup_cfg("ggml-hexagon.cfg") diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 48b019f918a08..3bcdbda72f512 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1830,7 +1830,7 @@ static void ggmlhexagon_load_cfg() { }); std::string precision_mode; std::string ggml_hexagon_version; - hexagoncfg_instance.get_stringvalue("general", "ggml_hexagon_version", 
ggml_hexagon_version, "1.03"); + hexagoncfg_instance.get_stringvalue("general", "version", ggml_hexagon_version, "1.00"); hexagoncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1); hexagoncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0); hexagoncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0); diff --git a/scripts/build-run-android.sh b/scripts/build-run-android.sh index 2dd784058abed..da06a881b1ac5 100755 --- a/scripts/build-run-android.sh +++ b/scripts/build-run-android.sh @@ -12,6 +12,7 @@ PWD=`pwd` REMOTE_PATH=/data/local/tmp/ #LLM model file on Android phone GGUF_MODEL_NAME=/sdcard/gemma-3-4b-it-Q8_0.gguf +#https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/blob/main/qwen1_5-1_8b-chat-q4_0.gguf GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf #Android NDK can be found at: @@ -241,6 +242,10 @@ function prepare_run_on_phone() adb push ./out/android/bin/*.so ${REMOTE_PATH}/ fi adb push ./out/android/bin/${program} ${REMOTE_PATH}/ + #for non developers: deploy dev ops once time with build outputs in ./out/android/bin/ + #adb push ./out/android/bin/ggml-hexagon.cfg ${REMOTE_PATH}/ + #for developers: modify ./scritps/ggml-hexagon.cfg before run + adb push ./scripts/ggml-hexagon.cfg ${REMOTE_PATH}/ adb shell chmod +x ${REMOTE_PATH}/${program} } diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 654ae34a452a2..3ce07156c7c90 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -40,14 +40,16 @@ hwaccel_approach = 2 #attention: # a. HWACCEL_QNN_SINGLEGRAPH not supported at the moment; # b. following combinations are valid: -# 1: hwaccel_approach = 2 AND hexagon_backend = 2(this is the default setting) -# 2: hwaccel_approach = 0 AND hexagon_backend = 2(QNNNPU) +# 1: hwaccel_approach = 2 AND hexagon_backend = 2(HWACCEL_CDSP, this is the default setting) +# 2: hwaccel_approach = 0 AND hexagon_backend = 2(HWACCEL_QNN, QNNNPU) # 3: hwaccel_approach = 0 AND hexagon_backend = 1(QNNGPU) # 4: hwaccel_approach = 0 AND hexagon_backend = 0(QNNCPU) -# 5: hwaccel_approach = 2 AND hexagon_backend = 3 -# 6: hwaccel_approach = 0 AND hexagon_backend = 3 +# 5: hwaccel_approach = 2 AND hexagon_backend = 3(fall back to the default ggml backend) +# 6: hwaccel_approach = 0 AND hexagon_backend = 3(fall back to the default ggml backend) # -#generally speaking, we only need to focus on b-1 and b-2 in this PR. +#generally speaking, +# a. we only need to focus on b-1 and b-2 in this PR. +# b. 
we can compare Hexagon NPU performance between HWACCEL_CDSP/HWACCEL_QNN(QNNNPU)/the default ggml backend accordingly #enable/disable offload quantized type mulmat From c45cd5e67f684f01dda2ea26d105ac71fc8f3cea Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 12 Apr 2025 16:27:49 +0800 Subject: [PATCH 188/200] ggml-hexagon: fix a build issue in CI --- CMakeLists.txt | 20 ++++++++++++-------- scripts/ggml-hexagon.cfg | 4 ++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 35d955065ce53..f33ca3208068d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,14 +7,18 @@ set(CMAKE_WARN_UNUSED_CLI YES) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") - #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend - set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only") - message("OPT_FLAG:${OPT_FLAG}") - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") +if(CMAKE_SYSTEM_NAME STREQUAL "Android") + if(DEFINED HTP_ARCH_VERSION) + if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") + #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend + set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only") + message("OPT_FLAG:${OPT_FLAG}") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -DGGML_USE_HEXAGON ${DEBUG_FLAG} ${OPT_FLAG}") + endif() + endif() endif() if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE) diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 3ce07156c7c90..67bacdd1a3c72 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -41,14 +41,14 @@ hwaccel_approach = 2 # a. HWACCEL_QNN_SINGLEGRAPH not supported at the moment; # b. following combinations are valid: # 1: hwaccel_approach = 2 AND hexagon_backend = 2(HWACCEL_CDSP, this is the default setting) -# 2: hwaccel_approach = 0 AND hexagon_backend = 2(HWACCEL_QNN, QNNNPU) +# 2: hwaccel_approach = 0 AND hexagon_backend = 2(HWACCEL_QNN, aka QNNNPU) # 3: hwaccel_approach = 0 AND hexagon_backend = 1(QNNGPU) # 4: hwaccel_approach = 0 AND hexagon_backend = 0(QNNCPU) # 5: hwaccel_approach = 2 AND hexagon_backend = 3(fall back to the default ggml backend) # 6: hwaccel_approach = 0 AND hexagon_backend = 3(fall back to the default ggml backend) # #generally speaking, -# a. we only need to focus on b-1 and b-2 in this PR. +# a. we only need to focus on b-1(HWACCEL_CDSP) and b-2(HWACCEL_QNN, aka QNNNPU). # b. 
we can compare Hexagon NPU performance between HWACCEL_CDSP/HWACCEL_QNN(QNNNPU)/the default ggml backend accordingly From 7b55a4662bb56fa444f8aff6c032ef1675c9c785 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 15 Apr 2025 14:55:02 +0800 Subject: [PATCH 189/200] ggml-dsp: cleanup code --- ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 4 +++- ggml/src/ggml-hexagon/kernels/mulmat.c | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index d58e8ec8c5a74..74e3c68d06f5f 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -273,7 +273,9 @@ void ggmlhexagon_log_internal(int level, const char *file, const char *func, int } void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { - //return; +#if !GGMLHEXAGON_DEBUG + return; +#endif float value = 0; char tmpbuf[GGMLHEXAGON_LOGBUF_LEN]; size_t buflen = 0; diff --git a/ggml/src/ggml-hexagon/kernels/mulmat.c b/ggml/src/ggml-hexagon/kernels/mulmat.c index 4c4ad2ba75ea5..273a3c3dc65b0 100644 --- a/ggml/src/ggml-hexagon/kernels/mulmat.c +++ b/ggml/src/ggml-hexagon/kernels/mulmat.c @@ -274,5 +274,4 @@ int ggmlop_dsp_mulmat(remote_handle64 h, const struct dsptensor * src0, const st } else { return ggmlop_dsp_mulmat_singlethread(h, src0, src1, dst); } - return 0; } From 6f118978223d90eae85ffa09e212a256d68e3770 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 15 Apr 2025 23:18:05 +0800 Subject: [PATCH 190/200] ggml-hexagon: sync with upstream --- ggml/src/ggml-hexagon/kernels/add.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml/src/ggml-hexagon/kernels/add.c b/ggml/src/ggml-hexagon/kernels/add.c index 36d2e7bd69c57..9d64e21d3918e 100644 --- a/ggml/src/ggml-hexagon/kernels/add.c +++ b/ggml/src/ggml-hexagon/kernels/add.c @@ -4,7 +4,6 @@ inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float HVX_Vector * va; HVX_Vector * vb; HVX_Vector * vc; - HVX_Vector qf32; const int FLOATS_PER_VECTOR = 128 / sizeof(float); const int block = n / FLOATS_PER_VECTOR; const int left = n % FLOATS_PER_VECTOR; @@ -29,9 +28,7 @@ inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float vb = (HVX_Vector *)y; vc = (HVX_Vector *)z; for (size_t i = 0; i < block; ++i) { - qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); - *vc = Q6_Vsf_equals_Vqf32(qf32); - vc++; + *vc++ = Q6_Vsf_vadd_VsfVsf(*va++, *vb++); } if (left > 0) { From 157b6b1ba8a7f822966c60a8284ae89ffc0af0f1 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 16 Apr 2025 17:09:29 +0800 Subject: [PATCH 191/200] ggml-dsp: cleanup code --- ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 707 +---------------------- ggml/src/ggml-hexagon/kernels/ggml-dsp.h | 250 +------- ggml/src/ggml-hexagon/kernels/mulmat.c | 145 ++--- 3 files changed, 93 insertions(+), 1009 deletions(-) diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index 74e3c68d06f5f..025d6c99173e1 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -1,260 +1,10 @@ #include "ggml-dsp.h" // ================================================================================================= -// section-1: tiny ggml-dsp, ported from original ggml +// tiny ggml-dsp, ported from original ggml // ================================================================================================= - -static void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, 
const float * GGML_RESTRICT x, size_t bx, const float * GGML_RESTRICT y, size_t by, int nrc); - -static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k); -static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k); -static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc); - -static float ggml_table_f32_f16[1 << 16]; - -static struct ggml_compute_params g_params; - static int32 g_thread_counts = 1; -struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = { - [GGML_TYPE_F32] = { - .vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f32, - .vec_dot_type = GGML_TYPE_F32, - .nrows = 1, - }, - [GGML_TYPE_F16] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_F16, - .nrows = 1, - }, - [GGML_TYPE_Q4_0] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, -#if defined (__ARM_FEATURE_MATMUL_INT8) - .nrows = 2, -#else - .nrows = 1, -#endif - }, - [GGML_TYPE_Q4_1] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_1, -#if defined (__ARM_FEATURE_MATMUL_INT8) - .nrows = 2, -#else - .nrows = 1, -#endif - }, - [GGML_TYPE_Q5_0] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, - .nrows = 1, - }, - [GGML_TYPE_Q5_1] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_1, - .nrows = 1, - }, - [GGML_TYPE_Q8_0] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_0, -#if defined (__ARM_FEATURE_MATMUL_INT8) - .nrows = 2, -#else - .nrows = 1, -#endif - }, - [GGML_TYPE_Q8_1] = { - .from_float = NULL, - .vec_dot_type = GGML_TYPE_Q8_1, - .nrows = 1, - }, - [GGML_TYPE_Q2_K] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, - }, - [GGML_TYPE_Q3_K] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, - }, - [GGML_TYPE_Q4_K] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, - }, - [GGML_TYPE_Q5_K] = { - .from_float = NULL, - .vec_dot = NULL, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, - }, - [GGML_TYPE_Q6_K] = { - .from_float = quantize_row_q6_K, - .vec_dot = ggml_vec_dot_q6_K_q8_K, - .vec_dot_type = GGML_TYPE_Q8_K, - .nrows = 1, - }, -}; - -static const struct ggml_type_traits type_traits[GGML_TYPE_COUNT] = { - [GGML_TYPE_I8] = { - .type_name = "i8", - .blck_size = 1, - .type_size = sizeof(int8_t), - .is_quantized = false, - }, - [GGML_TYPE_I16] = { - .type_name = "i16", - .blck_size = 1, - .type_size = sizeof(int16_t), - .is_quantized = false, - }, - [GGML_TYPE_I32] = { - .type_name = "i32", - .blck_size = 1, - .type_size = sizeof(int32_t), - .is_quantized = false, - }, - [GGML_TYPE_I64] = { - .type_name = "i64", - .blck_size = 1, - .type_size = sizeof(int64_t), - .is_quantized = false, - }, - [GGML_TYPE_F64] = { - .type_name = "f64", - .blck_size = 1, - .type_size = sizeof(double), - .is_quantized = false, - }, - [GGML_TYPE_F32] = { - .type_name = "f32", - .blck_size = 1, - .type_size = sizeof(float), - .is_quantized = false, - }, - [GGML_TYPE_F16] = { - .type_name = "f16", - .blck_size = 1, - .type_size = sizeof(ggml_fp16_t), - .is_quantized = false, - .to_float = NULL, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q4_0] = { - .type_name = 
"q4_0", - .blck_size = QK4_0, - .type_size = sizeof(block_q4_0), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q4_1] = { - .type_name = "q4_1", - .blck_size = QK4_1, - .type_size = sizeof(block_q4_1), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, - }, - [4] = { // GGML_TYPE_Q4_2 - .type_name = "DEPRECATED", - .blck_size = 0, - .type_size = 0, - .is_quantized = false, - }, - [5] = { // GGML_TYPE_Q4_3 - .type_name = "DEPRECATED", - .blck_size = 0, - .type_size = 0, - .is_quantized = false, - }, - [GGML_TYPE_Q5_0] = { - .type_name = "q5_0", - .blck_size = QK5_0, - .type_size = sizeof(block_q5_0), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q5_1] = { - .type_name = "q5_1", - .blck_size = QK5_1, - .type_size = sizeof(block_q5_1), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q8_0] = { - .type_name = "q8_0", - .blck_size = QK8_0, - .type_size = sizeof(block_q8_0), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q8_1] = { - .type_name = "q8_1", - .blck_size = QK8_1, - .type_size = sizeof(block_q8_1), - .is_quantized = true, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q2_K] = { - .type_name = "q2_K", - .blck_size = QK_K, - .type_size = sizeof(block_q2_K), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q3_K] = { - .type_name = "q3_K", - .blck_size = QK_K, - .type_size = sizeof(block_q3_K), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q4_K] = { - .type_name = "q4_K", - .blck_size = QK_K, - .type_size = sizeof(block_q4_K), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q5_K] = { - .type_name = "q5_K", - .blck_size = QK_K, - .type_size = sizeof(block_q5_K), - .is_quantized = true, - .to_float = NULL, - .from_float_ref = NULL, - }, - [GGML_TYPE_Q6_K] = { - .type_name = "q6_K", - .blck_size = QK_K, - .type_size = sizeof(block_q6_K), - .is_quantized = true, - .to_float = (ggml_to_float_t) dequantize_row_q6_K, - .from_float_ref = (ggml_from_float_t) quantize_row_q6_K_ref, - }, - -}; - void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) 
{ #if !GGMLHEXAGON_DEBUG return; @@ -309,63 +59,19 @@ void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data) { } } -static const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type) { - return &type_traits_cpu[type]; -} - -void ggml_vec_dot_f32(int n, float * GGML_RESTRICT s, size_t bs, const float * GGML_RESTRICT x, - size_t bx, const float *GGML_RESTRICT y, size_t by, int nrc) { - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - ggml_float sumf = 0.0; - for (int i = 0; i < n; ++i) { - sumf += (ggml_float) (x[i] * y[i]); - } - *s = sumf; -} - -inline void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { - for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; -} - -inline void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { - for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; -} - -inline void ggml_vec_sub_f32 (const int n, float * z, const float * x, const float * y) { - for (int i = 0; i < n; ++i) z[i] = x[i] - y[i]; -} - -const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type) { - return &type_traits[type]; -} - -int64_t ggml_blck_size(enum ggml_type type) { - return type_traits[type].blck_size; -} - -size_t ggml_type_size(enum ggml_type type) { - return type_traits[type].type_size; -} - size_t ggml_row_size(enum ggml_type type, int64_t ne) { - assert(ne % ggml_blck_size(type) == 0); - return ggml_type_size(type)*ne/ggml_blck_size(type); + return 4*ne; } size_t ggml_nbytes(const struct ggml_tensor * tensor) { size_t nbytes; - const size_t blck_size = ggml_blck_size(tensor->type); + const size_t blck_size = 1; if (blck_size == 1) { - nbytes = ggml_type_size(tensor->type); + nbytes = 4; for (int i = 0; i < GGML_MAX_DIMS; ++i) { nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; } - } - else { + } else { nbytes = tensor->ne[0]*tensor->nb[0]/blck_size; for (int i = 1; i < GGML_MAX_DIMS; ++i) { nbytes += (tensor->ne[i] - 1)*tensor->nb[i]; @@ -375,22 +81,6 @@ size_t ggml_nbytes(const struct ggml_tensor * tensor) { return nbytes; } -size_t ggml_nbytes_pad(const struct ggml_tensor * tensor) { - return GGML_PAD(ggml_nbytes(tensor), GGML_MEM_ALIGN); -} - -double ggml_type_sizef(enum ggml_type type) { - return ((double)(type_traits[type].type_size))/type_traits[type].blck_size; -} - -const char * ggml_type_name(enum ggml_type type) { - return type < GGML_TYPE_COUNT ? 
type_traits[type].type_name : "NONE"; -} - -bool ggml_is_quantized(enum ggml_type type) { - return type_traits[type].is_quantized; -} - bool ggml_is_empty(const struct ggml_tensor * tensor) { for (int i = 0; i < GGML_MAX_DIMS; ++i) { if (tensor->ne[i] == 0) { @@ -429,12 +119,12 @@ bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; } -static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { - size_t next_nb = ggml_type_size(tensor->type); - if (tensor->ne[0] != ggml_blck_size(tensor->type) && tensor->nb[0] != next_nb) { +bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { + size_t next_nb = 4; + if (tensor->ne[0] != 1 && tensor->nb[0] != next_nb) { return false; } - next_nb *= tensor->ne[0]/ggml_blck_size(tensor->type); + next_nb *= tensor->ne[0]; for (int i = 1; i < GGML_MAX_DIMS; i++) { if (tensor->ne[i] != 1) { if (i > n) { @@ -451,7 +141,7 @@ static bool ggml_is_contiguous_n(const struct ggml_tensor * tensor, int n) { return true; } - int64_t ggml_nelements(const struct ggml_tensor * tensor) { +int64_t ggml_nelements(const struct ggml_tensor * tensor) { static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); return tensor->ne[0]*tensor->ne[1]*tensor->ne[2]*tensor->ne[3]; @@ -461,365 +151,21 @@ static bool ggml_is_contiguous_0(const struct ggml_tensor * tensor) { return ggml_is_contiguous_n(tensor, 0); } - bool ggml_is_contiguous(const struct ggml_tensor * tensor) { +bool ggml_is_contiguous(const struct ggml_tensor * tensor) { return ggml_is_contiguous_0(tensor); } -inline static void ggml_vec_add_f32 (const int n, float * z, const float * x, const float * y) { - for (int i = 0; i < n; ++i) z[i] = x[i] + y[i]; -} - void ggml_abort(const char * file, int line, const char * fmt, ...) { GGMLHEXAGON_LOG_DEBUG("enter ggml_abort"); abort(); } -// FP16 <-> FP32 -static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; -} - -static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; -} - -static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; -#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float exp_scale = 0x1.0p-112f; -#else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); -#endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? 
fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); -} - - inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { -#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; -#else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); -#endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); -} - - inline float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; -} - -static inline void ggml_init(void) { - for (int i = 0; i < (1 << 16); ++i) { - union { - uint16_t u16; - ggml_fp16_t fp16; - } u = {i}; - ggml_table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(u.fp16); - } - - //FIXME:HVX multithreading should be utilized in hexagon-kernels - g_params.ith = 0; - g_params.nth = 1; - //FIXME:hardcode buffer size - g_params.wsize = 512 * 1024 * 1024; - g_params.wdata = (char*)malloc(g_params.wsize); - GGML_ASSERT(NULL != g_params.wdata); -} - -static inline void ggml_deinit(void) { - free(g_params.wdata); - g_params.wdata = NULL; - g_params.wsize = 0; -} - -static inline int nearest_int(float fval) { - assert(fabsf(fval) <= 4194303.f); - float val = fval + 12582912.f; - int i; memcpy(&i, &val, sizeof(int)); - return (i & 0x007fffff) - 0x00400000; -} - -static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type, - const float * GGML_RESTRICT qw) { - float max = 0; - float amax = 0; - for (int i = 0; i < n; ++i) { - float ax = fabsf(x[i]); - if (ax > amax) { amax = ax; max = x[i]; } - } - if (amax < GROUP_MAX_EPS) { // all zero - for (int i = 0; i < n; ++i) { - L[i] = 0; - } - return 0.f; - } - float iscale = -nmax / max; - if (rmse_type == 0) { - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); - } - return 1/iscale; - } - bool return_early = false; - if (rmse_type < 0) { - rmse_type = -rmse_type; - return_early = true; - } - float sumlx = 0; - float suml2 = 0; -#ifdef HAVE_BUGGY_APPLE_LINKER - // use 'volatile' to prevent unroll and work around a bug in Apple ld64 1015.7 - for (volatile int i = 0; i < n; ++i) { -#else - for (int i = 0; i < n; ++i) { -#endif - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - L[i] = l + nmax; - float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); - sumlx += w*x[i]*l; - suml2 += w*l*l; - } - float scale = suml2 ? sumlx/suml2 : 0.0f; - if (return_early) return suml2 > 0 ? 
0.5f*(scale + 1/iscale) : 1/iscale; - float best = scale * sumlx; - for (int is = -9; is <= 9; ++is) { - if (is == 0) { - continue; - } - iscale = -(nmax + 0.1f*is) / max; - sumlx = suml2 = 0; - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - l = MAX(-nmax, MIN(nmax-1, l)); - float w = qw ? qw[i] : rmse_type == 1 ? x[i] * x[i] : rmse_type == 2 ? 1 : rmse_type == 3 ? fabsf(x[i]) : sqrtf(fabsf(x[i])); - sumlx += w*x[i]*l; - suml2 += w*l*l; - } - if (suml2 > 0 && sumlx*sumlx > best*suml2) { - for (int i = 0; i < n; ++i) { - int l = nearest_int(iscale * x[i]); - L[i] = nmax + MAX(-nmax, MIN(nmax-1, l)); - } - scale = sumlx/suml2; best = scale*sumlx; - } - } - return scale; -} - -static void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k) { - assert(k % QK_K == 0); - const int64_t nb = k / QK_K; - - for (int i = 0; i < nb; i++) { - const float d = GGML_FP16_TO_FP32(x[i].d); - - const uint8_t * GGML_RESTRICT ql = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT sc = x[i].scales; - - for (int n = 0; n < QK_K; n += 128) { - for (int l = 0; l < 32; ++l) { - int is = l/16; - const int8_t q1 = (int8_t)((ql[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - const int8_t q2 = (int8_t)((ql[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - const int8_t q3 = (int8_t)((ql[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - const int8_t q4 = (int8_t)((ql[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - y[l + 0] = d * sc[is + 0] * q1; - y[l + 32] = d * sc[is + 2] * q2; - y[l + 64] = d * sc[is + 4] * q3; - y[l + 96] = d * sc[is + 6] * q4; - } - y += 128; - ql += 64; - qh += 32; - sc += 8; - } - } -} - -static void quantize_row_q6_K_ref(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k) { - assert(k % QK_K == 0); - const int64_t nb = k / QK_K; - - int8_t L[QK_K]; - float scales[QK_K/16]; - - for (int i = 0; i < nb; i++) { - - float max_scale = 0; - float max_abs_scale = 0; - - for (int ib = 0; ib < QK_K/16; ++ib) { - - const float scale = make_qx_quants(16, 32, x + 16*ib, L + 16*ib, 1, NULL); - scales[ib] = scale; - - const float abs_scale = fabsf(scale); - if (abs_scale > max_abs_scale) { - max_abs_scale = abs_scale; - max_scale = scale; - } - - } - - if (max_abs_scale < GROUP_MAX_EPS) { - memset(&y[i], 0, sizeof(block_q6_K)); - y[i].d = GGML_FP32_TO_FP16(0.f); - x += QK_K; - continue; - } - - float iscale = -128.f/max_scale; - y[i].d = GGML_FP32_TO_FP16(1/iscale); - for (int ib = 0; ib < QK_K/16; ++ib) { - y[i].scales[ib] = MIN(127, nearest_int(iscale*scales[ib])); - } - - for (int j = 0; j < QK_K/16; ++j) { - float d = GGML_FP16_TO_FP32(y[i].d) * y[i].scales[j]; - if (!d) { - continue; - } - for (int ii = 0; ii < 16; ++ii) { - int l = nearest_int(x[16*j + ii]/d); - l = MAX(-32, MIN(31, l)); - L[16*j + ii] = l + 32; - } - } - - uint8_t * GGML_RESTRICT ql = y[i].ql; - uint8_t * GGML_RESTRICT qh = y[i].qh; - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - const uint8_t q1 = L[j + l + 0] & 0xF; - const uint8_t q2 = L[j + l + 32] & 0xF; - const uint8_t q3 = L[j + l + 64] & 0xF; - const uint8_t q4 = L[j + l + 96] & 0xF; - ql[l+ 0] = q1 | (q3 << 4); - ql[l+32] = q2 | (q4 << 4); - qh[l] = (L[j + l] >> 4) | ((L[j + l + 32] >> 4) << 2) | ((L[j + l + 64] >> 4) << 4) | ((L[j + l + 96] >> 4) << 6); - } - ql += 64; - qh += 32; - } - - x += QK_K; - } -} - -static void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int64_t k) { - assert(k % QK_K == 
0); - block_q6_K * GGML_RESTRICT y = vy; - quantize_row_q6_K_ref(x, y, k); -} - -static void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc) { - assert(n % QK_K == 0); - assert(nrc == 1); - UNUSED(nrc); - UNUSED(bx); - UNUSED(by); - UNUSED(bs); - - const block_q6_K * GGML_RESTRICT x = vx; - const block_q8_K * GGML_RESTRICT y = vy; - - const int nb = n / QK_K; - - int8_t aux8[QK_K]; - int16_t aux16[8]; - float sums [8]; - int32_t aux32[8]; - memset(sums, 0, 8*sizeof(float)); - - float sumf = 0; - for (int i = 0; i < nb; ++i) { - const uint8_t * GGML_RESTRICT q4 = x[i].ql; - const uint8_t * GGML_RESTRICT qh = x[i].qh; - const int8_t * GGML_RESTRICT q8 = y[i].qs; - memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * GGML_RESTRICT a = aux8; - for (int j = 0; j < QK_K; j += 128) { - for (int l = 0; l < 32; ++l) { - a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; - a[l + 32] = (int8_t)((q4[l + 32] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; - a[l + 64] = (int8_t)((q4[l + 0] >> 4) | (((qh[l] >> 4) & 3) << 4)) - 32; - a[l + 96] = (int8_t)((q4[l + 32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32; - } - a += 128; - q4 += 64; - qh += 32; - } - a = aux8; - int is = 0; - for (int j = 0; j < QK_K/16; ++j) { - int scale = x[i].scales[is++]; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - for (int l = 0; l < 8; ++l) aux16[l] = q8[l] * a[l]; - for (int l = 0; l < 8; ++l) aux32[l] += scale * aux16[l]; - q8 += 8; a += 8; - } - const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - for (int l = 0; l < 8; ++l) sums[l] += d * aux32[l]; - } - for (int l = 0; l < 8; ++l) sumf += sums[l]; - *s = sumf; - -} - static inline uint64 hexagon_perf_get_time_us(void) { unsigned long long count; asm volatile (" %0 = c31:30 " : "=r"(count)); return (uint64)(count) * 10ull / 192ull; } -void ggml_time_init(void) { -} - int64_t ggml_time_ms(void) { return hexagon_perf_get_time_us() * 1000; } @@ -828,25 +174,10 @@ int64_t ggml_time_us(void) { return hexagon_perf_get_time_us(); } -// ================================================================================================= -// ggml-hexagon kernel helper function -// ================================================================================================= int ggmlop_get_thread_counts(void) { return g_thread_counts; } -struct ggml_compute_params * ggmlop_get_params(void) { - return &g_params; -} - -int ggml_get_params_size(void) { - return g_params.wsize; -} - -char * ggml_get_params_data(void) { - return g_params.wdata; -} - // ================================================================================================= // implementation of ggml-hexagon kernel skel function // ================================================================================================= @@ -857,8 +188,6 @@ int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { *handle = (remote_handle64)tptr; assert(*handle); - ggml_init(); - GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version()); GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%d", qurt_hvx_get_units()); qurt_arch_version_t vers; @@ -878,8 +207,6 @@ int ggmlop_dsp_close(remote_handle64 handle) { if (handle) free((void*)handle); - ggml_deinit(); - return 0; } @@ -933,33 +260,23 @@ AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 return AEE_SUCCESS; } - // 
================================================================================================= -// implementation of ggml-hexagon kernel, it's better to put every kernel to a single file +// implementation of ggml-hexagon kernel, it's better to put every hexagon-kernel to a single file // ================================================================================================= int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; } int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; } int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - - return 0; } diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h index 1a9b32858efd3..4ad3167383981 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -54,18 +54,6 @@ extern "C" { #define static_assert(a, b) do { } while (0) -#define GROUP_MAX_EPS 1e-15f - -// QK = number of values after dequantization -// QK_K = super-block size -#define QK_K 256 -#define K_SCALE_SIZE 12 - -#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) -#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) - //NPU performance will be slower when enable GGMLHEXAGON_DEBUG #ifdef NDEBUG #define GGMLHEXAGON_DEBUG 0 @@ -120,261 +108,29 @@ extern "C" { enum ggmlhexagon_log_level { GGMLHEXAGON_LOG_LEVEL_NONE = 0, GGMLHEXAGON_LOG_LEVEL_DEBUG = 1, - GGMLHEXAGON_LOG_LEVEL_INFO = 2, - GGMLHEXAGON_LOG_LEVEL_WARN = 3, - GGMLHEXAGON_LOG_LEVEL_ERROR = 4, - GGMLHEXAGON_LOG_LEVEL_CONT = 5, }; enum ggml_type { GGML_TYPE_F32 = 0, - GGML_TYPE_F16 = 1, - GGML_TYPE_Q4_0 = 2, - GGML_TYPE_Q4_1 = 3, - // GGML_TYPE_Q4_2 = 4, support has been removed - // GGML_TYPE_Q4_3 = 5, support has been removed - GGML_TYPE_Q5_0 = 6, - GGML_TYPE_Q5_1 = 7, - GGML_TYPE_Q8_0 = 8, - GGML_TYPE_Q8_1 = 9, - GGML_TYPE_Q2_K = 10, - GGML_TYPE_Q3_K = 11, - GGML_TYPE_Q4_K = 12, - GGML_TYPE_Q5_K = 13, - GGML_TYPE_Q6_K = 14, - GGML_TYPE_Q8_K = 15, - GGML_TYPE_IQ2_XXS = 16, - GGML_TYPE_IQ2_XS = 17, - GGML_TYPE_IQ3_XXS = 18, - GGML_TYPE_IQ1_S = 19, - GGML_TYPE_IQ4_NL = 20, - GGML_TYPE_IQ3_S = 21, - GGML_TYPE_IQ2_S = 22, - GGML_TYPE_IQ4_XS = 23, - GGML_TYPE_I8 = 24, - GGML_TYPE_I16 = 25, - GGML_TYPE_I32 = 26, - GGML_TYPE_I64 = 27, - GGML_TYPE_F64 = 28, - GGML_TYPE_IQ1_M = 29, - GGML_TYPE_BF16 = 30, - // GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files - // GGML_TYPE_Q4_0_4_8 = 32, - // GGML_TYPE_Q4_0_8_8 = 33, - GGML_TYPE_TQ1_0 = 34, - GGML_TYPE_TQ2_0 = 35, - // GGML_TYPE_IQ4_NL_4_4 = 36, - // GGML_TYPE_IQ4_NL_4_8 = 37, - // GGML_TYPE_IQ4_NL_8_8 = 38, - GGML_TYPE_COUNT = 39, }; typedef double ggml_float; -typedef uint16_t ggml_fp16_t; -typedef uint16_t ggml_half; -typedef uint32_t ggml_half2; -typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, - const void * GGML_RESTRICT y, size_t by, int nrc); -typedef void (*ggml_from_float_t)(const float 
* GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - -typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); -typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); - -struct ggml_compute_params { - // ith = thread index, nth = number of threads - int ith, nth; - - // work buffer for all threads - size_t wsize; - void * wdata; -}; - -#define QK4_0 32 -typedef struct { - ggml_half d; // delta - uint8_t qs[QK4_0 / 2]; // nibbles / quants -} block_q4_0; - -#define QK4_1 32 -typedef struct { - union { - struct { - ggml_half d; // delta - ggml_half m; // min - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; - uint8_t qs[QK4_1 / 2]; // nibbles / quants -} block_q4_1; - -#define QK5_0 32 -typedef struct { - ggml_half d; // delta - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_0 / 2]; // nibbles / quants -} block_q5_0; - -#define QK5_1 32 -typedef struct { - union { - struct { - ggml_half d; // delta - ggml_half m; // min - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; - uint8_t qh[4]; // 5-th bit of quants - uint8_t qs[QK5_1 / 2]; // nibbles / quants -} block_q5_1; - -#define QK8_0 32 -typedef struct { - ggml_half d; // delta - int8_t qs[QK8_0]; // quants -} block_q8_0; - -#define QK8_1 32 -typedef struct { - union { - struct { - ggml_half d; // delta - ggml_half s; // d * sum(qs[i]) - } GGML_COMMON_AGGR_S; - ggml_half2 ds; - } GGML_COMMON_AGGR_U; - int8_t qs[QK8_1]; // quants -} block_q8_1; - -// 2-bit quantization -// weight is represented as x = a * q + b -// 16 blocks of 16 elements each -// Effectively 2.625 bits per weight -typedef struct { - uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits - uint8_t qs[QK_K/4]; // quants - union { - struct { - ggml_half d; // super-block scale for quantized scales - ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; -} block_q2_K; - -// 3-bit quantization -// weight is represented as x = a * q -// 16 blocks of 16 elements each -// Effectively 3.4375 bits per weight -typedef struct { - uint8_t hmask[QK_K/8]; // quants - high bit - uint8_t qs[QK_K/4]; // quants - low 2 bits - uint8_t scales[12]; // scales, quantized with 6 bits - ggml_half d; // super-block scale -} block_q3_K; - -// 4-bit quantization -// 8 blocks of 32 elements each -// weight is represented as x = a * q + b -// Effectively 4.5 bits per weight -typedef struct { - union { - struct { - ggml_half d; // super-block scale for quantized scales - ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qs[QK_K/2]; // 4--bit quants -} block_q4_K; - -// 5-bit quantization -// 8 blocks of 32 elements each -// weight is represented as x = a * q + b -// Effectively 5.5 bits per weight -typedef struct { - union { - struct { - ggml_half d; // super-block scale for quantized scales - ggml_half dmin; // super-block scale for quantized mins - } GGML_COMMON_AGGR_S; - ggml_half2 dm; - } GGML_COMMON_AGGR_U; - uint8_t scales[K_SCALE_SIZE]; // scales and mins, quantized with 6 bits - uint8_t qh[QK_K/8]; // quants, high bit - uint8_t qs[QK_K/2]; // quants, low 4 bits -} block_q5_K; - -// 6-bit quantization -// weight is represented as x = a * q -// 16 blocks of 16 elements each -// Effectively 6.5625 bits per weight -typedef struct { - uint8_t 
ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - ggml_half d; // super-block scale -} block_q6_K; - -typedef struct { - float d; // delta - int8_t qs[QK_K]; // quants - int16_t bsums[QK_K/16]; // sum of quants in groups of 16 -} block_q8_K; -struct ggml_type_traits { - const char * type_name; - int64_t blck_size; - int64_t blck_size_interleave; // interleave elements in blocks - size_t type_size; - bool is_quantized; - ggml_to_float_t to_float; - ggml_from_float_t from_float_ref; -}; - -struct ggml_type_traits_cpu { - ggml_from_float_t from_float; - ggml_vec_dot_t vec_dot; - enum ggml_type vec_dot_type; - int64_t nrows; // number of rows to process simultaneously -}; - -void ggml_time_init(void); int64_t ggml_time_ms(void); int64_t ggml_time_us(void); -size_t ggml_type_size(enum ggml_type type); -int64_t ggml_blck_size(enum ggml_type type); - -inline float ggml_lookup_fp16_to_fp32(ggml_fp16_t f); -inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f); - -int64_t ggml_nrows(const struct ggml_tensor * tensor); -bool ggml_is_transposed(const struct ggml_tensor * tensor); - -bool ggml_is_empty(const struct ggml_tensor * tensor); size_t ggml_nbytes(const struct ggml_tensor * tensor); - -bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); -bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); - +int64_t ggml_nrows(const struct ggml_tensor * tensor); +bool ggml_is_contiguous(const struct ggml_tensor * tensor); void ggml_abort(const char * file, int line, const char * fmt, ...); bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); - -size_t ggml_row_size(enum ggml_type type, int64_t ne); -int64_t ggml_nelements(const struct ggml_tensor * tensor); -bool ggml_is_contiguous(const struct ggml_tensor * tensor); +bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor); void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data); void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...); int ggmlop_get_thread_counts(void); -int ggml_get_params_size(void); -char * ggml_get_params_data(void); -struct ggml_compute_params * ggmlop_get_params(void); - -extern struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT]; #ifdef __cplusplus } diff --git a/ggml/src/ggml-hexagon/kernels/mulmat.c b/ggml/src/ggml-hexagon/kernels/mulmat.c index 273a3c3dc65b0..5d2be891e4f28 100644 --- a/ggml/src/ggml-hexagon/kernels/mulmat.c +++ b/ggml/src/ggml-hexagon/kernels/mulmat.c @@ -1,16 +1,67 @@ #include "ggml-dsp.h" -static void ggml_compute_forward_mul_mat_one_chunk( - const struct ggml_compute_params * params, - const ggml_tensor * src0, - const ggml_tensor * src1, - struct ggml_tensor * dst, - const enum ggml_type type, - const int32_t num_rows_per_vec_dot, - const int32_t ir0_start, - const int32_t ir0_end, - const int32_t ir1_start, - const int32_t ir1_end) { +// 128 byte vectors +#define VSIZE_BYTES 128 +#define VSIZE_WORDS VSIZE_BYTES/4 + +union ui32f { int32_t i; float f; }; + +// create a vector of floats from a float +static __attribute__((always_inline)) HVX_Vector create_sfv_from_sf(float value) { + union ui32f cvt; + cvt.f = value; + HVX_Vector tmp = Q6_V_vsplat_R(cvt.i); + return tmp; +} + +// create a vector of qf32's from a float +static 
__attribute__((always_inline)) HVX_Vector create_qf32v_from_sf(float value) { + HVX_Vector tmp = Q6_Vqf32_vadd_Vqf32Vsf(Q6_V_vsplat_R(0), create_sfv_from_sf(value)); + return tmp; +} + +// convert qf32 vector to float vector +static __attribute__((always_inline)) HVX_Vector convert_qf32v_to_fltv(HVX_Vector vect) { + HVX_Vector tmp = Q6_Vsf_equals_Vqf32(vect); + return tmp; +} + +// get lowest float from a vector of floats +static __attribute__((always_inline)) float get_flt0_from_fltv(HVX_Vector vect) { + union ui32f cvt; + cvt.i = vect[0]; + return cvt.f; +} + +// get lowest float from a vector of qf32's +static __attribute__((always_inline)) float get_flt0_from_qf32v(HVX_Vector vect) { + union ui32f cvt; + HVX_Vector tmp = convert_qf32v_to_fltv(vect); + cvt.i = tmp[0]; + return cvt.f; +} + +static void vec_dot_f32(int n, float *GGML_RESTRICT s, size_t bs, const float *GGML_RESTRICT x, + size_t bx, const float *GGML_RESTRICT y, size_t by, int nrc) { + assert(nrc == 1); + UNUSED(nrc); + UNUSED(bx); + UNUSED(by); + UNUSED(bs); + // scalar + ggml_float sumf = 0.0; + for (int i = 0; i < n; ++i) { + sumf += (ggml_float) (x[i] * y[i]); + } + *s = sumf; +} + +static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor *src0, const ggml_tensor *src1, + struct ggml_tensor *dst, + const enum ggml_type type, + const int32_t num_rows_per_vec_dot, + const int32_t ir0_start, const int32_t ir0_end, + const int32_t ir1_start, const int32_t ir1_end) { ggmlhexagon_dump_tensor(src0, 0); ggmlhexagon_dump_tensor(src1, 0); ggmlhexagon_dump_tensor(dst, 0); @@ -20,8 +71,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( dst->ne[2] = src1->ne[2]; dst->ne[3] = src1->ne[3]; - dst->nb[0] = ggml_type_size(src1->type); - dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[0] = 4; + dst->nb[1] = dst->nb[0] * dst->ne[0]; dst->nb[2] = dst->nb[1] * dst->ne[1]; dst->nb[3] = dst->nb[2] * dst->ne[2]; ggmlhexagon_dump_tensor(dst, 0); @@ -30,9 +81,6 @@ static void ggml_compute_forward_mul_mat_one_chunk( const bool src1_cont = ggml_is_contiguous(src1); - ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot; - enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type; - // broadcast factors const int32_t r2 = ne12 / ne02; const int32_t r3 = ne13 / ne03; @@ -41,8 +89,8 @@ static void ggml_compute_forward_mul_mat_one_chunk( return; } - const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; - const size_t row_size = ggml_row_size(vec_dot_type, ne10); + const void * wdata = src1->data; + const size_t row_size = 4* ne10; assert(ne12 % ne02 == 0); assert(ne13 % ne03 == 0); @@ -51,7 +99,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( const int32_t blck_0 = 16; const int32_t blck_1 = 16; - const size_t src1_col_stride = src1_cont || src1->type != vec_dot_type ? 
row_size : nb11; + const size_t src1_col_stride = src1_cont || nb11; // attempt to reduce false-sharing (does not seem to make a difference) // 16 * 2, accounting for mmla kernels @@ -77,19 +125,14 @@ static void ggml_compute_forward_mul_mat_one_chunk( // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides // if it is, then we have either copied the data to params->wdata and made it contiguous or we are using // the original src1 data pointer, so we should index using the indices directly - // TODO: this is a bit of a hack, we should probably have a better way to handle this const char * src1_col = (const char*)wdata + - (src1_cont || src1->type != vec_dot_type + (src1_cont ? (i11 + i12 * ne11 + i13 * ne12 * ne11) * row_size : (i11 * nb11 + i12 * nb12 + i13 * nb13)); float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); - //for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) { - // vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col); - //} - for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); + vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); } for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { @@ -100,7 +143,7 @@ static void ggml_compute_forward_mul_mat_one_chunk( } } -//FIXME: only support fp32 mulmat on cDSP +//TODO: only support fp32 mulmat on cDSP static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); ggmlhexagon_dump_tensor(src0, 0); @@ -112,19 +155,15 @@ static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * dst->ne[2] = src1->ne[2]; dst->ne[3] = src1->ne[3]; - dst->nb[0] = ggml_type_size(src1->type); - dst->nb[1] = dst->nb[0] * (dst->ne[0] / ggml_blck_size(src1->type)); + dst->nb[0] = 4; + dst->nb[1] = dst->nb[0] * dst->ne[0]; dst->nb[2] = dst->nb[1] * dst->ne[1]; dst->nb[3] = dst->nb[2] * dst->ne[2]; ggmlhexagon_dump_tensor(dst, 0); GGML_TENSOR_BINARY_OP_LOCALS - enum ggml_type const vec_dot_type = type_traits_cpu[src0->type].vec_dot_type; - ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float; - int32_t const vec_dot_num_rows = type_traits_cpu[src0->type].nrows; - const int ith = 0; - const int nth = 1; + int32_t const vec_dot_num_rows = 1; GGML_ASSERT(ne0 == ne01); GGML_ASSERT(ne1 == ne11); @@ -132,8 +171,8 @@ static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * GGML_ASSERT(ne3 == ne13); // we don't support permuted src0 or src1 - GGML_ASSERT(nb00 == ggml_type_size(src0->type)); - GGML_ASSERT(nb10 == ggml_type_size(src1->type)); + GGML_ASSERT(nb00 == 4); + GGML_ASSERT(nb10 == 4); // dst cannot be transposed or permuted GGML_ASSERT(nb0 == sizeof(float)); @@ -166,36 +205,6 @@ static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * } #endif - if (src1->type != vec_dot_type) { - size_t wsize = ggml_row_size(vec_dot_type, ggml_nelements(src1)); - GGML_ASSERT(wsize < ggml_get_params_size()); - } - - if (src1->type != vec_dot_type) { - char * 
wdata = ggml_get_params_data(); - - const size_t nbw0 = ggml_type_size(vec_dot_type); - const size_t nbw1 = ggml_row_size(vec_dot_type, ne10); - const size_t nbw2 = nbw1*ne11; - const size_t nbw3 = nbw2*ne12; - - assert(ggml_get_params_size() >= ne13*nbw3); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - - for (int64_t i13 = 0; i13 < ne13; ++i13) { - for (int64_t i12 = 0; i12 < ne12; ++i12) { - for (int64_t i11 = 0; i11 < ne11; ++i11) { - size_t bs = ggml_blck_size(vec_dot_type); - int64_t ne10_block_start = (ith * ne10/bs) / nth; - int64_t ne10_block_end = ((ith + 1) * ne10/bs) / nth; - from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10), - (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0), - (ne10_block_end - ne10_block_start) * bs); - } - } - } - } - // This is the size of the first dimension of the result, so we can iterate that way. (see the ASSERT above, these are the same numbers) const int32_t nr0 = ne0; @@ -250,7 +259,8 @@ static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) { num_rows_per_vec_dot = 1; } - ggml_compute_forward_mul_mat_one_chunk(ggmlop_get_params(), src0, src1, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end); + ggml_compute_forward_mul_mat_one_chunk(src0, src1, dst, src0->type, num_rows_per_vec_dot, + ir0_start, ir0_end, ir1_start, ir1_end); if (1 >= nchunk0 * nchunk1) { break; @@ -262,6 +272,7 @@ static int ggmlop_dsp_mulmat_singlethread(remote_handle64 h, const ggml_tensor * return 0; } +//TODO:multithreading mulmat static int ggmlop_dsp_mulmat_multithread(remote_handle64 h, const struct dsptensor * src0, const struct dsptensor * src1, dsptensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); From d4afea4637b64e942c9386bc14f2e8b5976065ec Mon Sep 17 00:00:00 2001 From: zhouwg Date: Wed, 16 Apr 2025 22:00:30 +0800 Subject: [PATCH 192/200] ggml-dsp:refine ggmlhexagon_dsp_add_f32 --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 2 +- ggml/src/ggml-hexagon/kernels/add.c | 13 +++++-------- 2 files changed, 6 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 3bcdbda72f512..b1284e3f18879 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -5584,7 +5584,7 @@ static bool ggmlhexagon_can_handle_op_through_qnn(ggml_backend_dev_t dev, const struct ggml_tensor * src0 = op_tensor->src[0]; struct ggml_tensor * src1 = op_tensor->src[1]; - const int64_t ne00 = src0->ne[0];; + const int64_t ne00 = src0->ne[0]; const int src0_rank = ggml_n_dims(src0); int src1_rank = 0; if (nullptr != src1) { diff --git a/ggml/src/ggml-hexagon/kernels/add.c b/ggml/src/ggml-hexagon/kernels/add.c index 9d64e21d3918e..3502928697681 100644 --- a/ggml/src/ggml-hexagon/kernels/add.c +++ b/ggml/src/ggml-hexagon/kernels/add.c @@ -4,18 +4,12 @@ inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float HVX_Vector * va; HVX_Vector * vb; HVX_Vector * vc; + HVX_Vector qf32; const int FLOATS_PER_VECTOR = 128 / sizeof(float); const int block = n / FLOATS_PER_VECTOR; const int left = n % FLOATS_PER_VECTOR; const int blocks = block * FLOATS_PER_VECTOR; - if (0 == block) { - for (size_t i = 0; i < n; ++i) - z[i] = x[i] + y[i]; - - return; - } - if ((((uintptr_t)z | (uintptr_t)x | 
(uintptr_t)y) % ALIGN_128_BYTE) != 0) { GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y); for (size_t i = 0; i < n; ++i) @@ -28,7 +22,10 @@ inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float vb = (HVX_Vector *)y; vc = (HVX_Vector *)z; for (size_t i = 0; i < block; ++i) { - *vc++ = Q6_Vsf_vadd_VsfVsf(*va++, *vb++); + //*vc++ = Q6_Vsf_vadd_VsfVsf(*va++, *vb++); + qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); + *vc = Q6_Vsf_equals_Vqf32(qf32); + vc++; } if (left > 0) { From 2862e2700c195bb784a23e62c86d18b93ad51ce9 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 17 Apr 2025 09:34:39 +0800 Subject: [PATCH 193/200] ggml-dsp: refine logic of thread_counts --- ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 12 +++++++--- ggml/src/ggml-hexagon/kernels/ggml-dsp.h | 30 ++++++++++++++++++++++-- ggml/src/ggml-hexagon/kernels/mulmat.c | 4 +++- 3 files changed, 40 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index 025d6c99173e1..4f66995190d5c 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -198,7 +198,8 @@ int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit); qurt_sysenv_max_hthreads_t mhwt; qurt_sysenv_get_max_hw_threads(&mhwt); - GGMLHEXAGON_LOG_DEBUG("max hardware threads=%d", mhwt.max_hthreads); + GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads); + g_thread_counts = mhwt.max_hthreads; return 0; } @@ -211,13 +212,18 @@ int ggmlop_dsp_close(remote_handle64 handle) { } AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) { - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); HAP_power_request_t request; memset(&request, 0, sizeof(HAP_power_request_t)); request.type = HAP_power_set_apptype; request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; - g_thread_counts = thread_counts; + GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts); + if (thread_counts > 1) + g_thread_counts = (thread_counts > g_thread_counts) ? 
g_thread_counts : thread_counts; + else + g_thread_counts = 1; + GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts); void * ggmop_ctx = (void*)(handle); int retval = HAP_power_set(ggmop_ctx, &request); diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h index 4ad3167383981..1e6c4b4749d82 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -50,9 +50,35 @@ extern "C" { #define GGML_MEM_ALIGN 16 #endif -#define GGML_RESTRICT +#ifdef __cplusplus +// restrict not standard in C++ +# if defined(__GNUC__) +# define GGML_RESTRICT __restrict__ +# elif defined(__clang__) +# define GGML_RESTRICT __restrict +# elif defined(_MSC_VER) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT +# endif +#else +# if defined (_MSC_VER) && (__STDC_VERSION__ < 201112L) +# define GGML_RESTRICT __restrict +# else +# define GGML_RESTRICT restrict +# endif +#endif + +#ifndef __cplusplus +#ifndef static_assert + #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L) + #define static_assert(cond, msg) _Static_assert(cond, msg) + #else + #define static_assert(cond, msg) struct global_scope_noop_trick + #endif +#endif +#endif // __cplusplus -#define static_assert(a, b) do { } while (0) //NPU performance will be slower when enable GGMLHEXAGON_DEBUG #ifdef NDEBUG diff --git a/ggml/src/ggml-hexagon/kernels/mulmat.c b/ggml/src/ggml-hexagon/kernels/mulmat.c index 5d2be891e4f28..f7494c8eaacf4 100644 --- a/ggml/src/ggml-hexagon/kernels/mulmat.c +++ b/ggml/src/ggml-hexagon/kernels/mulmat.c @@ -132,7 +132,9 @@ static void ggml_compute_forward_mul_mat_one_chunk(const ggml_tensor *src0, cons float * dst_col = (float*)((char*)dst->data + (i1 * nb1 + i2 * nb2 + i3 * nb3)); for (int32_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ir0 += num_rows_per_vec_dot) { - vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), src0_row + ir0 * nb01, (num_rows_per_vec_dot > 1 ? nb01 : 0), src1_col, (num_rows_per_vec_dot > 1 ? src1_col_stride : 0), num_rows_per_vec_dot); + vec_dot_f32(ne00, &tmp[ir0 - iir0], (num_rows_per_vec_dot > 1 ? 16 : 0), + (float*)(src0_row + ir0 * nb01), (num_rows_per_vec_dot > 1 ? nb01 : 0), + (float*)src1_col, (num_rows_per_vec_dot > 1 ? 
src1_col_stride : 0), num_rows_per_vec_dot); } for (int cn = 0; cn < num_rows_per_vec_dot; ++cn) { From c36bd932a9ff9db126ef4ee2e2c7bce01d470dea Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 17 Apr 2025 16:31:31 +0800 Subject: [PATCH 194/200] ggml-hexagon: release v1.06 and ready for code review --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 25 +++- ggml/src/ggml-hexagon/kernels/Makefile | 2 +- ggml/src/ggml-hexagon/kernels/entry.c | 115 ++++++++++++++++++ ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 148 +++++------------------ scripts/ggml-hexagon.cfg | 4 +- 5 files changed, 167 insertions(+), 127 deletions(-) create mode 100644 ggml/src/ggml-hexagon/kernels/entry.c diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index b1284e3f18879..3257813c7a6b0 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -383,8 +383,8 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { #elif defined(_WIN32) .qnn_runtimelib_path = "C:\\", #endif - .ggml_hexagon_version = {"1.05"}, - .ggml_dsp_version = {"0.62"}, + .ggml_hexagon_version = {"1.06"}, + .ggml_dsp_version = {"0.63"}, }; //file:///opt/qcom/aistack/qairt/2.31.0.250130/docs/QNN/general/overview.html#tbl-supported-snapdragon-devices @@ -1417,6 +1417,13 @@ class hexagon_appcfg { section = cur_section; trim(key); trim(value); + + //"1.00" -> 1.00 + if (value.front() == '"' && value.back() == '"') { + value.erase(0, 1); // erase the first character " + value.erase(value.size() - 1); // erase the last character " + } + return true; } @@ -1829,8 +1836,10 @@ static void ggmlhexagon_load_cfg() { GGMLHEXAGON_LOG_INFO("%s", tmposs.str().c_str()); }); std::string precision_mode; - std::string ggml_hexagon_version; - hexagoncfg_instance.get_stringvalue("general", "version", ggml_hexagon_version, "1.00"); + std::string version; //version of ggml-hexagon.cpp + std::string ggmldsp_version; //version of ggml-dsp.c + hexagoncfg_instance.get_stringvalue("general", "version", version, "1.00"); + hexagoncfg_instance.get_stringvalue("general", "ggmldsp_version", ggmldsp_version, "0.62"); hexagoncfg_instance.get_intvalue("general", "enable_perf", g_hexagon_appcfg.enable_perf, 1); hexagoncfg_instance.get_intvalue("general", "print_tensors_info", g_hexagon_appcfg.print_tensors_info, 0); hexagoncfg_instance.get_intvalue("general", "dump_op_info", g_hexagon_appcfg.dump_op_info, 0); @@ -1854,7 +1863,9 @@ static void ggmlhexagon_load_cfg() { GGMLHEXAGON_LOG_INFO("internal ggml_hexagon_version=%s", g_hexagon_appcfg.ggml_hexagon_version); GGMLHEXAGON_LOG_INFO("internal ggml_dsp_version=%s", g_hexagon_appcfg.ggml_dsp_version); - GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", ggml_hexagon_version.c_str()); + GGMLHEXAGON_LOG_INFO("external ggml_hexagon_version=%s", version.c_str()); + GGMLHEXAGON_LOG_INFO("external ggml_dsp_version=%s", ggmldsp_version.c_str()); + memcpy(g_hexagon_appcfg.ggml_dsp_version, ggmldsp_version.c_str(), strlen(ggmldsp_version.c_str())); GGMLHEXAGON_LOG_INFO("hwaccel_approach=%d(%s)", g_hexagon_appcfg.hwaccel_approach, ggmlhexagon_get_hwaccel_approach_name(g_hexagon_appcfg.hwaccel_approach)); GGMLHEXAGON_LOG_INFO("hexagon_backend=%d(%s)", g_hexagon_appcfg.hexagon_backend, @@ -5445,6 +5456,7 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ // between ARM-AP and cDSP. the mechanism in qidl/FastRPC is exactly similar to mechanism in TEE. 
// try to find a better/efficient approach to exchange necessary data between ARM-AP side and cDSP side. // manually modifying the important data structure ggml_tensor in ggml.h is not make-sense and not acceptable. + std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now(); dsptensor_0.data = src0->data; dsptensor_0.data_len = ggml_nbytes(src0); dsptensor_0.type = src0->type; @@ -5491,6 +5503,9 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ dsptensor_2.nb[3] = dst->nb[3]; memcpy(dsptensor_2.op_params, dst->op_params, GGML_MAX_OP_PARAMS / sizeof(int32_t)); + std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end_time - start_time; + GGMLHEXAGON_LOG_VERBOSE("pack duration %llu ns", duration.count()); hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); if (AEE_SUCCESS != hexagon_error) { diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile index b513a3f118d28..0e6b3fa2e4df6 100755 --- a/ggml/src/ggml-hexagon/kernels/Makefile +++ b/ggml/src/ggml-hexagon/kernels/Makefile @@ -21,7 +21,7 @@ CFLAGS=-m${HTP_ARCH_VERSION} -c -Ofast -Wall -Wstrict-prototypes -fno-zero-initi LDFLAGS=-m${HTP_ARCH_VERSION} -Wl,--defsym=ISDB_TRUSTED_FLAG=2 -Wl,--defsym=ISDB_SECURE_FLAG=2 -Wl,--no-threads -fpic -shared -Wl,-Bsymbolic -Wl,--wrap=malloc -Wl,--wrap=calloc -Wl,--wrap=free -Wl,--wrap=realloc -Wl,--wrap=memalign -lc -Wl,-soname=${TARGET} #SRCS = $(wildcard *.c) -SRCS = ggml-dsp.c skel.c add.c mulmat.c +SRCS = ggml-dsp.c skel.c entry.c add.c mulmat.c OBJS = $(patsubst %.c, %.o, $(SRCS)) ALL:$(OBJS) diff --git a/ggml/src/ggml-hexagon/kernels/entry.c b/ggml/src/ggml-hexagon/kernels/entry.c new file mode 100644 index 0000000000000..ea38beea673c0 --- /dev/null +++ b/ggml/src/ggml-hexagon/kernels/entry.c @@ -0,0 +1,115 @@ +#include "ggml-dsp.h" + +static int32 g_thread_counts = 1; + +int ggmlop_dsp_open(const char * uri, remote_handle64 * handle) { + void * tptr = NULL; + GGMLHEXAGON_LOG_DEBUG("uri %s", uri); + tptr = (void *)malloc(1); + GGML_ASSERT(NULL != tptr); + *handle = (remote_handle64)tptr; + + GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version()); + GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%d", qurt_hvx_get_units()); + qurt_arch_version_t vers; + qurt_sysenv_get_arch_version(&vers); + GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", vers.arch_version); + + qurt_sysenv_app_heap_t aheap; + qurt_sysenv_get_app_heap(&aheap); + GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit); + + qurt_sysenv_max_hthreads_t mhwt; + qurt_sysenv_get_max_hw_threads(&mhwt); + GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads); + g_thread_counts = mhwt.max_hthreads; + + return 0; +} + +int ggmlop_dsp_close(remote_handle64 handle) { + if (handle) + free((void*)handle); + + return 0; +} + +AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); + HAP_power_request_t request; + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_apptype; + request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; + + GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts); + if (thread_counts > 1) + g_thread_counts = (thread_counts > g_thread_counts) ? 
g_thread_counts : thread_counts; + else + g_thread_counts = 1; + GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts); + + void * ggmop_ctx = (void*)(handle); + int retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed first power vote"); + return AEE_EFAILED; + } + + //configure clocks & DCVS mode + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_DCVS_v2; + request.dcvs_v2.dcvs_enable = TRUE; + request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; + if (dcvs_enabled) { + request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; + request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; + } else { + request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner; + request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner; + } + request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; + request.dcvs_v2.set_dcvs_params = TRUE; + request.dcvs_v2.set_latency = TRUE; + request.dcvs_v2.latency = latency; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode"); + return AEE_EFAILED; + } + + memset(&request, 0, sizeof(HAP_power_request_t)); + request.type = HAP_power_set_HVX; + request.hvx.power_up = TRUE; + retval = HAP_power_set(ggmop_ctx, &request); + if (retval) { + GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power"); + return AEE_EFAILED; + } + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return AEE_SUCCESS; +} + +// ================================================================================================= +// implementation of ggml-hexagon kernel, it's better to put every hexagon-kernel to a single file +// ================================================================================================= +int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return 0; +} + +int ggmlop_get_thread_counts(void) { + return g_thread_counts; +} diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index 4f66995190d5c..4785cc508f986 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -1,10 +1,33 @@ +/* + * Copyright (c) 2025 The ggml authors + * + * Qualcomm Hexagon SDK and reference tech guides could be found at: + * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools + * + * this single-source-file or self-contained file is implementation of ggml-dsp: + * - a customized tiny ggml running on Qualcomm Hexagon cDSP + * - ported from original ggml + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, 
and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ #include "ggml-dsp.h" -// ================================================================================================= -// tiny ggml-dsp, ported from original ggml -// ================================================================================================= -static int32 g_thread_counts = 1; - void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...) { #if !GGMLHEXAGON_DEBUG return; @@ -30,7 +53,7 @@ void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor) { char tmpbuf[GGMLHEXAGON_LOGBUF_LEN]; size_t buflen = 0; if (tensor->type == GGML_TYPE_F32) { - memset(tmpbuf, 0, GGMLHEXAGON_LOG_LEVEL_DEBUG); + memset(tmpbuf, 0, GGMLHEXAGON_LOGBUF_LEN); for (int h = 0; h < tensor->ne[3]; h++) { for (int i = 0; i < tensor->ne[2]; i++) { for (int j = 0; j < tensor->ne[1]; j++) { @@ -173,116 +196,3 @@ int64_t ggml_time_ms(void) { int64_t ggml_time_us(void) { return hexagon_perf_get_time_us(); } - -int ggmlop_get_thread_counts(void) { - return g_thread_counts; -} - -// ================================================================================================= -// implementation of ggml-hexagon kernel skel function -// ================================================================================================= -int ggmlop_dsp_open(const char*uri, remote_handle64* handle) { - void *tptr = NULL; - GGMLHEXAGON_LOG_DEBUG("uri %s", uri); - tptr = (void *)malloc(1); - *handle = (remote_handle64)tptr; - assert(*handle); - - GGMLHEXAGON_LOG_DEBUG("api_version = 0x%x", qurt_api_version()); - GGMLHEXAGON_LOG_DEBUG("hvx units = 0x%d", qurt_hvx_get_units()); - qurt_arch_version_t vers; - qurt_sysenv_get_arch_version(&vers); - GGMLHEXAGON_LOG_DEBUG("arch_version=0x%x", vers.arch_version); - qurt_sysenv_app_heap_t aheap; - qurt_sysenv_get_app_heap(&aheap); - GGMLHEXAGON_LOG_DEBUG("aheap.heap_base=0x%x, aheap.heap_limit=0x%x", aheap.heap_base, aheap.heap_limit); - qurt_sysenv_max_hthreads_t mhwt; - qurt_sysenv_get_max_hw_threads(&mhwt); - GGMLHEXAGON_LOG_DEBUG("max hardware threads counts=%d", mhwt.max_hthreads); - g_thread_counts = mhwt.max_hthreads; - - return 0; -} - -int ggmlop_dsp_close(remote_handle64 handle) { - if (handle) - free((void*)handle); - - return 0; -} - -AEEResult ggmlop_dsp_setclocks(remote_handle64 handle, int32 power_level, int32 latency, int32 dcvs_enabled, int32 thread_counts) { - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); - HAP_power_request_t request; - memset(&request, 0, sizeof(HAP_power_request_t)); - request.type = HAP_power_set_apptype; - request.apptype = HAP_POWER_COMPUTE_CLIENT_CLASS; - - GGMLHEXAGON_LOG_DEBUG("user specified thread_counts %d", thread_counts); - if (thread_counts > 1) - g_thread_counts = (thread_counts > 
g_thread_counts) ? g_thread_counts : thread_counts; - else - g_thread_counts = 1; - GGMLHEXAGON_LOG_DEBUG("real thread_counts %d", g_thread_counts); - - void * ggmop_ctx = (void*)(handle); - int retval = HAP_power_set(ggmop_ctx, &request); - if (retval) { - GGMLHEXAGON_LOG_DEBUG("failed first power vote"); - return AEE_EFAILED; - } - - //configure clocks & DCVS mode - memset(&request, 0, sizeof(HAP_power_request_t)); - request.type = HAP_power_set_DCVS_v2; - request.dcvs_v2.dcvs_enable = TRUE; - request.dcvs_v2.dcvs_params.target_corner = (HAP_dcvs_voltage_corner_t)power_level; - if (dcvs_enabled) { - request.dcvs_v2.dcvs_params.min_corner = HAP_DCVS_VCORNER_DISABLE; - request.dcvs_v2.dcvs_params.max_corner = HAP_DCVS_VCORNER_DISABLE; - } else { - request.dcvs_v2.dcvs_params.min_corner = request.dcvs_v2.dcvs_params.target_corner; - request.dcvs_v2.dcvs_params.max_corner = request.dcvs_v2.dcvs_params.target_corner; - } - request.dcvs_v2.dcvs_option = HAP_DCVS_V2_PERFORMANCE_MODE; - request.dcvs_v2.set_dcvs_params = TRUE; - request.dcvs_v2.set_latency = TRUE; - request.dcvs_v2.latency = latency; - retval = HAP_power_set(ggmop_ctx, &request); - if (retval) { - GGMLHEXAGON_LOG_DEBUG("failed to vote for performance mode"); - return AEE_EFAILED; - } - - memset(&request, 0, sizeof(HAP_power_request_t)); - request.type = HAP_power_set_HVX; - request.hvx.power_up = TRUE; - retval = HAP_power_set(ggmop_ctx, &request); - if (retval) { - GGMLHEXAGON_LOG_DEBUG("failed to vote for HVX power"); - return AEE_EFAILED; - } - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return AEE_SUCCESS; -} - -// ================================================================================================= -// implementation of ggml-hexagon kernel, it's better to put every hexagon-kernel to a single file -// ================================================================================================= -int ggmlop_dsp_softmax(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; -} - -int ggmlop_dsp_rmsnorm(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; -} - -int ggmlop_dsp_pool2d(remote_handle64 h, const dsptensor * src0, const dsptensor * src1, dsptensor * dst) { - GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); - GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); - return 0; -} diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 67bacdd1a3c72..2ae65dcfec671 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -23,9 +23,9 @@ # [general] #version of ggml-hexagon.cpp on ARM-AP side -version = "1.05" +version = "1.06" #version of ggml-dsp.c on cDSP side -ggmldsp_version = "0.62" +ggmldsp_version = "0.63" #0: HEXAGON_BACKEND_QNNCPU #1: HEXAGON_BACKEND_QNNGPU From 4f70d23f57217838389a76cdbf4b2871ab430899 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Sat, 19 Apr 2025 10:02:47 +0800 Subject: [PATCH 195/200] ggml-dsp: make GGML_OP_ADD more faster on cDSP side --- ggml/src/ggml-hexagon/CMakeLists.txt | 3 ++ ggml/src/ggml-hexagon/ggml-hexagon.cpp | 30 +++++++++++++- ggml/src/ggml-hexagon/kernels/add.c | 53 +++++++++++++----------- ggml/src/ggml-hexagon/kernels/ggml-dsp.c | 9 ++++ ggml/src/ggml-hexagon/kernels/ggml-dsp.h | 29 +++++++------ 5 files changed, 87 insertions(+), 37 deletions(-) diff --git 
a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index c1393adb27b49..bb8c004c2f812 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -22,6 +22,9 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug") message("Debug mode:${DEBUG_FLAG}") else() set(DEBUG_FLAG "-DNDEBUG -Wall") +#manually disable all foreground logs in ggml-hexagon/CMakeLists.txt to +#make compare NPU performance through llama-bench more clear +#set(DEBUG_FLAG "-DNDEBUG -Wall -DDISABLE_ALL_LOG") message("Release mode:${DEBUG_FLAG}") endif() diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 3257813c7a6b0..9afe6681ca3fa 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -141,8 +141,16 @@ struct ggml_backend_hexagon_context; #define GGMLHEXAGON_LOG_ERROR(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_ERROR, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define GGMLHEXAGON_LOG_WARN(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_WARN , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) + +#if !defined (DISABLE_ALL_LOG) #define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define GGMLHEXAGON_LOG_VERBOSE(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) +#else +//manually disable all foreground logs in ggml-hexagon/CMakeLists.txt to +//make compare NPU performance through llama-bench more clear +#define GGMLHEXAGON_LOG_INFO(...) +#define GGMLHEXAGON_LOG_VERBOSE(...) +#endif #if GGMLHEXAGON_DEBUG #define GGMLHEXAGON_LOG_DEBUG(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) @@ -6365,7 +6373,27 @@ struct ggml_backend_hexagon_reg_context { static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) { GGML_UNUSED(reg); - return "ggml-hexagon"; + //return "ggml-hexagon"; + + //return accurate backend name rather than "ggml-hexagon" to + //make compare NPU performance through llama-bench more clear + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { + GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend); + return "Hexagon-cDSP"; + } + + if (HWACCEL_QNN == g_hexagon_appcfg.hwaccel_approach) { + if (HEXAGON_BACKEND_QNNNPU == g_hexagon_appcfg.hexagon_backend) + return "QNN-NPU"; + + if (HEXAGON_BACKEND_QNNGPU == g_hexagon_appcfg.hexagon_backend) + return "QNN-GPU"; + + if (HEXAGON_BACKEND_QNNCPU == g_hexagon_appcfg.hexagon_backend) + return "QNN-CPU"; + } + + return "unknown"; } static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) { diff --git a/ggml/src/ggml-hexagon/kernels/add.c b/ggml/src/ggml-hexagon/kernels/add.c index 3502928697681..25a2d73e23536 100644 --- a/ggml/src/ggml-hexagon/kernels/add.c +++ b/ggml/src/ggml-hexagon/kernels/add.c @@ -1,14 +1,21 @@ #include "ggml-dsp.h" -inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float * x, const float * y) { +static inline void l2fetch(const void * p, uint32_t stride, + uint32_t width, uint32_t height, + uint32_t dir) { + uint64_t control = HEXAGON_V64_CREATE_H(dir, stride, width, height); + __asm__ __volatile__ (" l2fetch(%0,%1) " : :"r"(p),"r"(control)); +} + +static inline void ggmlhexagon_dsp_add_f32(const int n, float * GGML_RESTRICT z, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) { HVX_Vector * va; HVX_Vector * vb; HVX_Vector * vc; HVX_Vector qf32; - 
const int FLOATS_PER_VECTOR = 128 / sizeof(float); - const int block = n / FLOATS_PER_VECTOR; - const int left = n % FLOATS_PER_VECTOR; - const int blocks = block * FLOATS_PER_VECTOR; + const size_t FLOATS_PER_VECTOR = 128 / sizeof(float); + const size_t block = n / FLOATS_PER_VECTOR; + const size_t left = n % FLOATS_PER_VECTOR; + const size_t blocks = block * FLOATS_PER_VECTOR; if ((((uintptr_t)z | (uintptr_t)x | (uintptr_t)y) % ALIGN_128_BYTE) != 0) { GGMLHEXAGON_LOG_DEBUG("memaddress mismatch alignment 128 bytes z:%p x:%p y:%p", z, x, y); @@ -21,11 +28,13 @@ inline static void ggmlhexagon_dsp_add_f32 (const int n, float * z, const float va = (HVX_Vector *)x; vb = (HVX_Vector *)y; vc = (HVX_Vector *)z; + //unroll is better but need more carefully check for various cases and I think DSP also don't like branch predication for (size_t i = 0; i < block; ++i) { + l2fetch(va + VLEN, VLEN, VLEN, 1, 0); + l2fetch(vb + VLEN, VLEN, VLEN, 1, 0); //*vc++ = Q6_Vsf_vadd_VsfVsf(*va++, *vb++); qf32 = Q6_Vqf32_vadd_VsfVsf(*va++, *vb++); - *vc = Q6_Vsf_equals_Vqf32(qf32); - vc++; + *vc++ = Q6_Vsf_equals_Vqf32(qf32); } if (left > 0) { @@ -49,6 +58,17 @@ static void ggml_compute_forward_add_f32( GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst)); + const int rank = ggml_n_dims(src0); + if (1 == rank) { + //element-wise addition with vector + const size_t len = src0->ne[0]; + float * dst_ptr = (float *) (dst->data); + float * src0_ptr = (float *) (src0->data); + float * src1_ptr = (float *) (src1->data); + ggmlhexagon_dsp_add_f32(len, dst_ptr, src0_ptr, src1_ptr); + return; + } + const int ith = 0; const int nth = 1; @@ -115,24 +135,9 @@ static void ggml_compute_forward_add_f32( } //FIXME: why failed with test-backend-ops when disable ion rpc mempool -int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) -{ +int ggmlop_dsp_add(remote_handle64 h, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGMLHEXAGON_LOG_DEBUG("enter %s\n", __func__); - switch (src0->type) { - case GGML_TYPE_F32: - { - if (src1->type == GGML_TYPE_F32) { - ggml_compute_forward_add_f32(src0, src1, dst); - } else { - GGML_ABORT("fatal error"); - } - break; - } - default: - { - GGML_ABORT("fatal error"); - } - } + ggml_compute_forward_add_f32(src0, src1, dst); GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); return 0; } diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c index 4785cc508f986..b64209971a0dc 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.c +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.c @@ -178,6 +178,15 @@ bool ggml_is_contiguous(const struct ggml_tensor * tensor) { return ggml_is_contiguous_0(tensor); } +int ggml_n_dims(const struct ggml_tensor * tensor) { + for (int i = GGML_MAX_DIMS - 1; i >= 1; --i) { + if (tensor->ne[i] > 1) { + return i + 1; + } + } + return 1; +} + void ggml_abort(const char * file, int line, const char * fmt, ...) 
{ GGMLHEXAGON_LOG_DEBUG("enter ggml_abort"); abort(); diff --git a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h index 1e6c4b4749d82..103b46b8ee7fc 100644 --- a/ggml/src/ggml-hexagon/kernels/ggml-dsp.h +++ b/ggml/src/ggml-hexagon/kernels/ggml-dsp.h @@ -31,6 +31,8 @@ extern "C" { #define ALIGN_128_BYTE 128 +#define VLEN 128 + #define GGML_UNUSED(x) (void)(x) #define UNUSED GGML_UNUSED @@ -50,6 +52,8 @@ extern "C" { #define GGML_MEM_ALIGN 16 #endif +#define GGML_API extern + #ifdef __cplusplus // restrict not standard in C++ # if defined(__GNUC__) @@ -142,21 +146,22 @@ enum ggml_type { typedef double ggml_float; -int64_t ggml_time_ms(void); -int64_t ggml_time_us(void); +GGML_API int64_t ggml_time_ms(void); +GGML_API int64_t ggml_time_us(void); -size_t ggml_nbytes(const struct ggml_tensor * tensor); -int64_t ggml_nrows(const struct ggml_tensor * tensor); -bool ggml_is_contiguous(const struct ggml_tensor * tensor); -void ggml_abort(const char * file, int line, const char * fmt, ...); -bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); -bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); +GGML_API size_t ggml_nbytes(const struct ggml_tensor * tensor); +GGML_API int64_t ggml_nrows(const struct ggml_tensor * tensor); +GGML_API int ggml_n_dims(const struct ggml_tensor * tensor); +GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor); +GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...); +GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1); +GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1); -void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor); -void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data); -void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...); +GGML_API void ggmlhexagon_dump_tensor_elements(const ggml_tensor * tensor); +GGML_API void ggmlhexagon_dump_tensor(const ggml_tensor * tensor, int dump_tensor_data); +GGML_API void ggmlhexagon_log_internal(int level, const char *file, const char *func, int line, const char *format, ...); -int ggmlop_get_thread_counts(void); +GGML_API int ggmlop_get_thread_counts(void); #ifdef __cplusplus } From 7b00b5164a2b68cf85dd909d110ddd2de2e0d362 Mon Sep 17 00:00:00 2001 From: zhouwg Date: Thu, 24 Apr 2025 10:11:40 +0800 Subject: [PATCH 196/200] ggml-hexagon: sync from project kantv(make ggml-hexagon backend can works in a standard Android APP) --- ggml/include/ggml-hexagon.h | 8 +- ggml/src/ggml-hexagon/CMakeLists.txt | 2 +- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 184 ++++++++++++++++--------- scripts/ggml-hexagon.cfg | 48 ++----- 4 files changed, 138 insertions(+), 104 deletions(-) diff --git a/ggml/include/ggml-hexagon.h b/ggml/include/ggml-hexagon.h index 8e37f7da73adf..8e42e3fdb0c5b 100644 --- a/ggml/include/ggml-hexagon.h +++ b/ggml/include/ggml-hexagon.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2025 The ggml authors + * Copyright (c) 2024-2025 The ggml authors * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to @@ -28,15 +28,15 @@ extern "C" { #endif -#define GGML_HEXAGON_MAX_DEVICES 3 +#define GGML_HEXAGON_MAX_DEVICES 4 #define GGML_HEXAGON_BACKEND_NAME "hexagon" enum HEXAGONBackend { HEXAGON_BACKEND_QNNCPU = 0, 
HEXAGON_BACKEND_QNNGPU = 1, HEXAGON_BACKEND_QNNNPU = 2, - HEXAGON_BACKEND_CDSP = 2, - HEXAGON_BACKEND_GGML = 3, //"fake" QNN backend for compare performance between HEXAGON backend and ggml backend + HEXAGON_BACKEND_CDSP = 3, + HEXAGON_BACKEND_GGML = 4, //"fake" HEXAGON backend for compare performance between HEXAGON backend and ggml backend }; GGML_BACKEND_API ggml_backend_t ggml_backend_hexagon_init(size_t dev_num, const char * qnn_lib_path); diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index bb8c004c2f812..9ce8199821c30 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -22,7 +22,7 @@ if (CMAKE_BUILD_TYPE STREQUAL "Debug") message("Debug mode:${DEBUG_FLAG}") else() set(DEBUG_FLAG "-DNDEBUG -Wall") -#manually disable all foreground logs in ggml-hexagon/CMakeLists.txt to +#manually disable all verbose logs in ggml-hexagon/CMakeLists.txt to #make compare NPU performance through llama-bench more clear #set(DEBUG_FLAG "-DNDEBUG -Wall -DDISABLE_ALL_LOG") message("Release mode:${DEBUG_FLAG}") diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index 9afe6681ca3fa..ee2e02c2922a2 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2025 The ggml authors + * Copyright (c) 2024-2025 The ggml authors * * Qualcomm QNN SDK and reference tech guides could be found at: * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk @@ -136,6 +136,10 @@ struct ggml_backend_hexagon_context; #define GGMLHEXAGON_DEBUG 1 #endif +#ifndef PROJECT_NAME +#define PROJECT_NAME "ggml-hexagon" +#endif + #define GGMLHEXAGON_LOGBUF_LEN 4096 #define GGMLHEXAGON_TMPBUF_LEN 256 @@ -146,7 +150,7 @@ struct ggml_backend_hexagon_context; #define GGMLHEXAGON_LOG_INFO(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_INFO , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #define GGMLHEXAGON_LOG_VERBOSE(...) ggmlhexagon_log_internal(GGML_LOG_LEVEL_CONT , __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__) #else -//manually disable all foreground logs in ggml-hexagon/CMakeLists.txt to +//manually disable all verbose logs in ggml-hexagon/CMakeLists.txt to //make compare NPU performance through llama-bench more clear #define GGMLHEXAGON_LOG_INFO(...) #define GGMLHEXAGON_LOG_VERBOSE(...) 
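With HEXAGON_BACKEND_CDSP split out into its own slot (3) and HEXAGON_BACKEND_GGML moved to 4, application code now selects the cDSP path by device number instead of reusing the QNN-NPU slot. A hedged sketch of the call site, using the ggml_backend_hexagon_init() declaration from this header; the runtime library path is whatever directory the app ships its QNN/DSP libraries in, with /data/local/tmp/ being the command-line default used elsewhere in this series:

// Select the Hexagon cDSP device explicitly after the renumbering.
ggml_backend_t backend = ggml_backend_hexagon_init(HEXAGON_BACKEND_CDSP, "/data/local/tmp/");
if (backend == nullptr) {
    // initialization failed; HEXAGON_BACKEND_GGML (4) is the "fake" entry that
    // falls back to the default ggml backend for performance comparison
}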
@@ -351,7 +355,7 @@ struct hexagon_appcfg_t { int vtcm_size_in_mb; int enable_dlbc; int hwaccel_approach; // 0: HWACCEL_QNN 1: HWACCEL_QNN_SINGLEGRAPH 2: HWACCEL_CDSP - int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP + int hexagon_backend; // 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU 3: HEXAGON_BACKEND_CDSP 4: ggml int enable_rpc_ion_mempool; // enable/disable rpc ion memory pool int enable_all_q_mulmat; // enable/disable offload all quantized type mulmat to cDSP int profiler_duration; // threshold of duration in profiler, per seconds @@ -384,14 +388,17 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { .thread_counts = 4, .cfgfilename = "ggml-hexagon.cfg", #if defined(__ANDROID__) -//Android command line program + #if defined(STANDARD_ANDROID_APP) + .runtime_libpath = "/data/data/com.kantvai.kantvplayer/", + #else .runtime_libpath = "/data/local/tmp/", + #endif #elif defined(__linux__) .qnn_runtimelib_path = "/tmp/", #elif defined(_WIN32) .qnn_runtimelib_path = "C:\\", #endif - .ggml_hexagon_version = {"1.06"}, + .ggml_hexagon_version = {"1.07"}, .ggml_dsp_version = {"0.63"}, }; @@ -503,7 +510,7 @@ static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICE .rpc_mempool = nullptr, .rpc_mempool_handle = 0, .ggmlop_handle = 0, - .domain_id = HEXAGON_CDSP, + .domain_id = -1, }, { .device = 1, @@ -531,7 +538,7 @@ static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICE .rpc_mempool = nullptr, .rpc_mempool_handle = 0, .ggmlop_handle = 0, - .domain_id = HEXAGON_CDSP, + .domain_id = -1, }, { .device = 2, @@ -559,8 +566,31 @@ static struct ggml_backend_hexagon_context g_hexagon_mgr[GGML_HEXAGON_MAX_DEVICE .rpc_mempool = nullptr, .rpc_mempool_handle = 0, .ggmlop_handle = 0, - .domain_id = HEXAGON_CDSP, + .domain_id = -1, }, + { .device = 3, + .name = "Hexagon-cDSP", + .desc = "Qualcomm NPU(cDSP)", + .lib = "", + .instance = nullptr, + .backend = nullptr, + .raw_interface = {}, + .raw_system_interface = {}, + .socinfo = {}, + .qnn_singlenode_graph_map = {}, + .work_data = nullptr, + .tasks = {}, + .work_size = 0, + .desired_size = 0, + .n_threads = 8, + .rpc_mempool_capacity = 0, + .rpc_mempool_len = 0, + .rpc_mempool_usage = 0, + .rpc_mempool = nullptr, + .rpc_mempool_handle = 0, + .ggmlop_handle = 0, + .domain_id = HEXAGON_CDSP, + }, }; static domain hexagon_supported_domains[] = { @@ -836,7 +866,7 @@ static void ggmlhexagon_log_internal(ggml_log_level level, const char * file, co int len = vsnprintf(s_ggmlhexagon_log_internal_buf + len_prefix, GGMLHEXAGON_LOGBUF_LEN - len_prefix, format, args); if (len < (GGMLHEXAGON_LOGBUF_LEN - len_prefix)) { #if (defined __ANDROID__) || (defined ANDROID) - __android_log_print(ANDROID_LOG_INFO, "ggml-hexagon", "%s\n", s_ggmlhexagon_log_internal_buf); + __android_log_print(ANDROID_LOG_INFO, PROJECT_NAME, "%s\n", s_ggmlhexagon_log_internal_buf); if (GGML_LOG_LEVEL_INFO == level) { printf("%s\n", s_ggmlhexagon_log_internal_buf); } @@ -1915,7 +1945,7 @@ static bool ggmlhexagon_check_valid_appcfg() { } if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - if (HEXAGON_BACKEND_CDSP != g_hexagon_appcfg.hexagon_backend) { + if ((HEXAGON_BACKEND_CDSP != g_hexagon_appcfg.hexagon_backend) && (HEXAGON_BACKEND_GGML != g_hexagon_appcfg.hexagon_backend)) { GGMLHEXAGON_LOG_INFO("hwaccel_approach HWACCEL_CDSP must match with hexagon_backend HEXAGON_BACKEND_CDSP"); is_valid_appcfg = false; } @@ -5271,6 +5301,9 @@ 
static void ggmlhexagon_deinit_cdsp(ggml_backend_hexagon_context * ctx) { } static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { + static std::mutex mutex; + std::lock_guard lock(mutex); + int hexagon_error = AEE_SUCCESS; int domain_id = HEXAGON_CDSP; @@ -5291,7 +5324,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { if (nullptr == ctx) return 1; - GGMLHEXAGON_LOG_INFO("init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + GGMLHEXAGON_LOG_DEBUG("init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); if (0 != ctx->ggmlop_handle) { GGMLHEXAGON_LOG_DEBUG("already init Hexagon cDSP with backend %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); return 0; @@ -5513,7 +5546,7 @@ static void ggmlhexagon_compute(ggml_backend_hexagon_context * ctx, struct ggml_ memcpy(dsptensor_2.op_params, dst->op_params, GGML_MAX_OP_PARAMS / sizeof(int32_t)); std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end_time - start_time; - GGMLHEXAGON_LOG_VERBOSE("pack duration %llu ns", duration.count()); + GGMLHEXAGON_LOG_DEBUG("pack duration %llu ns", duration.count()); hexagon_error = op_func(ctx->ggmlop_handle, &dsptensor_0, &dsptensor_1, &dsptensor_2); if (AEE_SUCCESS != hexagon_error) { @@ -5906,10 +5939,9 @@ static const char * ggml_backend_hexagon_buffer_type_name(ggml_backend_buffer_ty static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( ggml_backend_buffer_type_t buft, size_t size) { + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); struct ggml_backend_hexagon_context * ctx = static_cast(buft->context); GGML_ASSERT(nullptr != ctx); - GGMLHEXAGON_LOG_DEBUG("device %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); - ggml_backend_hexagon_buffer_context * buffer_ctx = new ggml_backend_hexagon_buffer_context; size_t size_page = 0; @@ -5924,7 +5956,10 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( if (0 != (size_aligned % size_page)) { size_aligned += (size_page - (size_aligned % size_page)); } + if ((HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { + GGMLHEXAGON_LOG_DEBUG("device %d(%s)", ctx->device, ggml_backend_hexagon_get_devname(ctx->device)); + GGML_ASSERT(nullptr != ctx->rpc_mempool); GGML_ASSERT(size + ctx->rpc_mempool_usage <= ctx->rpc_mempool_len); buffer_ctx->buffer = (static_cast(ctx->rpc_mempool)) + ctx->rpc_mempool_usage; GGMLHEXAGON_LOG_DEBUG("size %d(%d MiB), buffer_ctx->buffer %p", size, size / SIZE_IN_MB, buffer_ctx->buffer); @@ -5940,7 +5975,7 @@ static ggml_backend_buffer_t ggml_backend_hexagon_buffer_type_alloc_buffer( } else { //GGMLHEXAGON_LOG_DEBUG("%s: succeed to allocate %d MiB\n", __func__, size / SIZE_IN_MB); } - + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); return ggml_backend_buffer_init(buft, ggml_backend_hexagon_buffer_interface, buffer_ctx, size); } @@ -6110,9 +6145,6 @@ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_ static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_backend_dev_t dev) { struct ggml_backend_hexagon_context * ctx = static_cast(dev->context); - if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - return GGML_BACKEND_DEVICE_TYPE_GPU; - } if (HEXAGON_BACKEND_QNNCPU == ctx->device) return GGML_BACKEND_DEVICE_TYPE_ACCEL; @@ -6120,6 +6152,8 @@ 
static enum ggml_backend_dev_type ggml_backend_hexagon_device_get_type(ggml_back return GGML_BACKEND_DEVICE_TYPE_ACCEL; else if (HEXAGON_BACKEND_QNNNPU == ctx->device) return GGML_BACKEND_DEVICE_TYPE_ACCEL; + else if (HEXAGON_BACKEND_CDSP == ctx->device) + return GGML_BACKEND_DEVICE_TYPE_GPU; else return GGML_BACKEND_DEVICE_TYPE_CPU; } @@ -6173,38 +6207,60 @@ static ggml_backend_t ggml_backend_hexagon_device_init_backend(ggml_backend_dev_ GGMLHEXAGON_LOG_DEBUG("leave %s\n", __func__); return hexagon_backend; - } static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device_index) { + static std::mutex mutex; + std::lock_guard lock(mutex); + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__ ); if (device_index >= GGML_HEXAGON_MAX_DEVICES) { GGMLHEXAGON_LOG_DEBUG("ggml_backend_hexagon_buffer_type error: device_index:%d is out of range [0, %d]\n", device_index, GGML_HEXAGON_MAX_DEVICES - 1); return nullptr; } - static struct ggml_backend_buffer_type ggml_backend_buffer_type_hexagon = { - /* .iface = */ { - /* .get_name = */ ggml_backend_hexagon_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_hexagon_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_hexagon_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_hexagon_buffer_type_get_max_size, - /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes - /* .is_host = */ ggml_backend_hexagon_buffer_is_host - }, - /* .device = */ nullptr, - /* .context = */ &g_hexagon_mgr[device_index], - }; + if (device_index != (size_t)(g_hexagon_appcfg.hexagon_backend)) { + //cover following special case: + // toggle backend and forth between cDSP and ggml in a standard Android APP or in + // a same running process + g_hexagon_appcfg.hexagon_backend = device_index; + } + + static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_types[GGML_HEXAGON_MAX_DEVICES]; + static bool ggml_backend_hexagon_buffer_type_initialized = false; + if (!ggml_backend_hexagon_buffer_type_initialized) { + for (int i = 0; i < GGML_HEXAGON_MAX_DEVICES; i++) { + ggml_backend_hexagon_buffer_types[i] = { + /* .iface = */ { + /* .get_name = */ ggml_backend_hexagon_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_hexagon_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_hexagon_buffer_type_get_alignment, + /* .get_max_size = */ ggml_backend_hexagon_buffer_type_get_max_size, + /* .get_alloc_size = */ nullptr,// defaults to ggml_nbytes + /* .is_host = */ ggml_backend_hexagon_buffer_is_host + }, + /* .device = */ ggml_backend_reg_dev_get(ggml_backend_hexagon_reg(), i), + /* .context = */ &g_hexagon_mgr[device_index], + }; + } + ggml_backend_hexagon_buffer_type_initialized = true; + } + if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - //here is the trick: - //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP - //and we need to re-use the g_hexagon_mgr - //so context is g_hexagon_mgr[HEXAGON_BACKEND_CDSP] rather than g_hexagon_mgr[0] - ggml_backend_buffer_type_hexagon.context = &g_hexagon_mgr[HEXAGON_BACKEND_CDSP]; + GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend); + //FIXME:this is workaround for cover following special case: + // toggle back and forth between cDSP and ggml in a standard Android APP or in a same running process + // there is unknown issue with this workaround when toggle back and forth frequently in a standard Android APP + int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]); + if (0 != result) { + 
GGMLHEXAGON_LOG_INFO("init hexagon dsp failure"); + return nullptr; + } } - return &ggml_backend_buffer_type_hexagon; + GGMLHEXAGON_LOG_DEBUG("leave %s", __func__ ); + return &ggml_backend_hexagon_buffer_types[device_index]; } static const char * ggml_backend_hexagon_host_buffer_type_name(ggml_backend_buffer_type_t buft) { @@ -6357,13 +6413,13 @@ static void ggml_backend_hexagon_set_n_threads(ggml_backend_t backend, int n_thr int ggml_backend_hexagon_get_device_count() { if (g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) { - GGML_ASSERT(g_hexagon_appcfg.hexagon_backend == HEXAGON_BACKEND_CDSP); //here is the trick: //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP //so return 1 return 1; } else { - return GGML_HEXAGON_MAX_DEVICES; + //QNN-CPU, QNN-GPU, QNN-NPU + return GGML_HEXAGON_MAX_DEVICES - 1; } } @@ -6377,23 +6433,19 @@ static const char * ggml_backend_hexagon_reg_get_name(ggml_backend_reg_t reg) { //return accurate backend name rather than "ggml-hexagon" to //make compare NPU performance through llama-bench more clear - if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - GGML_ASSERT(HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend); - return "Hexagon-cDSP"; - } + if (HEXAGON_BACKEND_QNNNPU == g_hexagon_appcfg.hexagon_backend) + return "QNN-NPU"; - if (HWACCEL_QNN == g_hexagon_appcfg.hwaccel_approach) { - if (HEXAGON_BACKEND_QNNNPU == g_hexagon_appcfg.hexagon_backend) - return "QNN-NPU"; + if (HEXAGON_BACKEND_QNNGPU == g_hexagon_appcfg.hexagon_backend) + return "QNN-GPU"; - if (HEXAGON_BACKEND_QNNGPU == g_hexagon_appcfg.hexagon_backend) - return "QNN-GPU"; + if (HEXAGON_BACKEND_QNNCPU == g_hexagon_appcfg.hexagon_backend) + return "QNN-CPU"; - if (HEXAGON_BACKEND_QNNCPU == g_hexagon_appcfg.hexagon_backend) - return "QNN-CPU"; - } + if (HEXAGON_BACKEND_CDSP == g_hexagon_appcfg.hexagon_backend) + return "Hexagon-cDSP"; - return "unknown"; + return "ggml"; } static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) { @@ -6405,7 +6457,8 @@ static size_t ggml_backend_hexagon_reg_get_device_count(ggml_backend_reg_t reg) //so return 1 return 1; } else { - return GGML_HEXAGON_MAX_DEVICES; + //QNN-CPU, QNN-GPU, QNN-NPU + return GGML_HEXAGON_MAX_DEVICES - 1; } } @@ -6422,7 +6475,7 @@ static ggml_backend_dev_t ggml_backend_hexagon_reg_get_device(ggml_backend_reg_t //so return ctx->devices[0] return ctx->devices[0]; } else { - GGML_ASSERT(index < ctx->devices.size()); + GGML_ASSERT(index <= ctx->devices.size()); return ctx->devices[index]; } } @@ -6437,6 +6490,7 @@ static void * ggml_backend_hexagon_reg_get_proc_address(ggml_backend_reg_t reg, if (0 == memcmp(name, slot_name, strlen(slot_name))) { return (void *)ggml_backend_hexagon_set_n_threads; } + return nullptr; } @@ -6449,8 +6503,12 @@ static const ggml_backend_reg_i ggml_backend_hexagon_reg_interface = { ggml_backend_reg_t ggml_backend_hexagon_reg() { static ggml_backend_reg reg; + //TODO: the existing codes can't cover following special case: + // toggle back and forth between QNN-NPU and cDSP and ggml in a standard Android APP or in + // a same running process + // supportive of such special case is easy but it will significantly increase the size of APK static bool initialized = false; - GGMLHEXAGON_LOG_DEBUG("enter ggml_backend_hexagon_reg"); + GGMLHEXAGON_LOG_DEBUG("enter %s", __func__); //case-2: normal scenario, such as llama-cli or UI applicaton ggmlhexagon_load_cfg(); @@ -6487,10 +6545,11 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { if (HWACCEL_CDSP 
== g_hexagon_appcfg.hwaccel_approach) { //here is the trick: //there only 1 backend_device when g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP - //and we need to re-use the g_hexagon_mgr //so context is g_hexagon_mgr[HEXAGON_BACKEND_CDSP] rather than g_hexagon_mgr[0] + //attention here: dev->context = &g_hexagon_mgr[HEXAGON_BACKEND_CDSP]; } + ctx->devices.push_back(dev); //here is the trick: make cDSP rpc memory pool happy because ggml's backend subsystem need this @@ -6499,8 +6558,9 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { int result = ggmlhexagon_init_dsp(&g_hexagon_mgr[HEXAGON_BACKEND_CDSP]); if (0 != result) { GGMLHEXAGON_LOG_INFO("init hexagon dsp failure"); + return nullptr; } - GGML_ASSERT(0 == result); + //GGML_ASSERT(0 == result); } } @@ -6519,12 +6579,6 @@ ggml_backend_reg_t ggml_backend_hexagon_reg() { } const char * ggml_backend_hexagon_get_devname(size_t dev_num) { - if (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach) { - if (HEXAGON_BACKEND_CDSP == dev_num) - return "HEXAGON_BACKEND_CDSP"; - } - - //here is the trick: fall back for various scenarios switch (dev_num) { case HEXAGON_BACKEND_QNNCPU: return "HEXAGON_BACKEND_QNN_CPU"; @@ -6532,6 +6586,8 @@ const char * ggml_backend_hexagon_get_devname(size_t dev_num) { return "HEXAGON_BACKEND_QNN_GPU"; case HEXAGON_BACKEND_QNNNPU: return "HEXAGON_BACKEND_QNN_NPU"; + case HEXAGON_BACKEND_CDSP: + return "HEXAGON_BACKEND_CDSP"; case HEXAGON_BACKEND_GGML: return "ggml"; //"fake" hexagon backend, used for compare performance between hexagon backend and the default ggml backend default: @@ -6571,7 +6627,7 @@ static qnn_instance * ggmlqnn_init_qnn_instance(size_t device, const char * qnn_ /** * - * @param device 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU/HEXAGON_BACKEND_CDSP + * @param device 0: HEXAGON_BACKEND_QNNCPU 1: HEXAGON_BACKEND_QNNGPU 2: HEXAGON_BACKEND_QNNNPU 3: HEXAGON_BACKEND_CDSP 4: ggml * @param runtime_libpath binary runtime library path, such as "/data/local/tmp/" on Android or specified in user's code * @return */ diff --git a/scripts/ggml-hexagon.cfg b/scripts/ggml-hexagon.cfg index 2ae65dcfec671..eb1c2fe5ca0c4 100644 --- a/scripts/ggml-hexagon.cfg +++ b/scripts/ggml-hexagon.cfg @@ -1,37 +1,15 @@ -# -# Copyright (c) 2023-2025 The ggml authors -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in -# all copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -# IN THE SOFTWARE. 
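The simplified name reporting above keys purely off g_hexagon_appcfg.hexagon_backend, and the reported string is what llama-bench prints. A sketch of the post-patch mapping (the helper name is hypothetical; the strings are the ones returned by ggml_backend_hexagon_reg_get_name in the hunk above):

static const char * hexagon_backend_name(int hexagon_backend) {
    switch (hexagon_backend) {
        case HEXAGON_BACKEND_QNNCPU: return "QNN-CPU";
        case HEXAGON_BACKEND_QNNGPU: return "QNN-GPU";
        case HEXAGON_BACKEND_QNNNPU: return "QNN-NPU";
        case HEXAGON_BACKEND_CDSP:   return "Hexagon-cDSP";
        default:                     return "ggml";  // HEXAGON_BACKEND_GGML and anything else
    }
}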
-# -# runtime configuration for ggml-hexagon backend -# [general] #version of ggml-hexagon.cpp on ARM-AP side -version = "1.06" +version = "1.07" #version of ggml-dsp.c on cDSP side ggmldsp_version = "0.63" #0: HEXAGON_BACKEND_QNNCPU #1: HEXAGON_BACKEND_QNNGPU -#2: HEXAGON_BACKEND_QNNNPU / HEXAGON_BACKEND_CDSP -#3: default ggml backend -hexagon_backend = 2 +#2: HEXAGON_BACKEND_QNNNPU +#3: HEXAGON_BACKEND_CDSP +#4: default ggml backend +hexagon_backend = 3 # 0: hwaccel approach through HWACCEL_QNN: offload ggml op to QNN # 1: hwaccel approach through HWACCEL_QNN_SINGLEGRAPH: mapping entire ggml cgraph to a single QNN graph # 2: hwaccel approach through HWACCEL_CDSP:offload ggml op to cDSP directly @@ -40,20 +18,20 @@ hwaccel_approach = 2 #attention: # a. HWACCEL_QNN_SINGLEGRAPH not supported at the moment; # b. following combinations are valid: -# 1: hwaccel_approach = 2 AND hexagon_backend = 2(HWACCEL_CDSP, this is the default setting) -# 2: hwaccel_approach = 0 AND hexagon_backend = 2(HWACCEL_QNN, aka QNNNPU) +# 1: hwaccel_approach = 2 AND hexagon_backend = 3(HWACCEL_CDSP, this is the default setting) +# 2: hwaccel_approach = 0 AND hexagon_backend = 2(QNNNPU) # 3: hwaccel_approach = 0 AND hexagon_backend = 1(QNNGPU) # 4: hwaccel_approach = 0 AND hexagon_backend = 0(QNNCPU) -# 5: hwaccel_approach = 2 AND hexagon_backend = 3(fall back to the default ggml backend) -# 6: hwaccel_approach = 0 AND hexagon_backend = 3(fall back to the default ggml backend) +# 5: hwaccel_approach = 2 AND hexagon_backend = 4(fall back to the default ggml backend) +# 6: hwaccel_approach = 0 AND hexagon_backend = 4(fall back to the default ggml backend) # #generally speaking, -# a. we only need to focus on b-1(HWACCEL_CDSP) and b-2(HWACCEL_QNN, aka QNNNPU). -# b. we can compare Hexagon NPU performance between HWACCEL_CDSP/HWACCEL_QNN(QNNNPU)/the default ggml backend accordingly +# a. we only need to focus on b-1(HWACCEL_CDSP) and b-2(QNNNPU). +# b. 
we can compare Hexagon NPU performance between HWACCEL_CDSP/QNNNPU/the default ggml backend accordingly #enable/disable offload quantized type mulmat -#quatized type mulmat works fine through HWACCEL_QNN at the moment +#quatized type mulmat works fine through QNNNPU at the moment #quatized type mulmat doesn't works fine through HWACCEL_CDSP at the moment #this item will make mulmat performance comprision easily enable_q_mulmat = 0 @@ -70,7 +48,7 @@ dump_op_info = 0 enable_perf = 1 -# enablie/disable profiler feature to visually compare NPU performance between HWACCEL_CDSP and HWACCEL_QNN +# enablie/disable profiler feature to visually compare NPU performance between HWACCEL_CDSP and QNNNPU # this is default setting enable_profiler = 0 #threshold duration of NPU performance profiler, per seconds From b6072fa653b076732f678c490c53c5c2e24f9bbb Mon Sep 17 00:00:00 2001 From: zhouwg Date: Tue, 29 Apr 2025 09:26:32 +0800 Subject: [PATCH 197/200] sync with upstream llama.cpp and sync ggml-hexagon.cpp from project kantv --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index ee2e02c2922a2..c088d8d68f89d 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -654,6 +654,7 @@ static constexpr const qnn_op_caps ggmlqnn_k_op_caps[] = { {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr}, {false, GGML_OP_IM2COL, 0, nullptr}, {false, GGML_OP_IM2COL_BACK, 0, nullptr}, + {false, GGML_OP_CONV_2D_DW, 0, nullptr}, {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr}, {false, GGML_OP_POOL_1D, 0, nullptr}, {false, GGML_OP_POOL_2D, 0, nullptr}, @@ -760,6 +761,7 @@ static constexpr const hexagon_op_caps ggmlhexagon_k_op_caps[] = { {false, GGML_OP_CONV_TRANSPOSE_1D, 0, nullptr, nullptr}, {false, GGML_OP_IM2COL, 0, nullptr, nullptr}, {false, GGML_OP_IM2COL_BACK, 0, nullptr, nullptr}, + {false, GGML_OP_CONV_2D_DW, 0, nullptr, nullptr}, {false, GGML_OP_CONV_TRANSPOSE_2D, 0, nullptr, nullptr}, {false, GGML_OP_POOL_1D, 0, nullptr, nullptr}, {true, GGML_OP_POOL_2D, 1, "ggmlop_dsp_pool2d", ggmlop_dsp_pool2d}, @@ -5574,6 +5576,7 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const const ggml_tensor * src0 = op_tensor->src[0]; const ggml_tensor * src1 = op_tensor->src[1]; const int src0_rank = ggml_n_dims(src0); + const int64_t ne00 = src0->ne[0]; int src1_rank = 0; if (nullptr != src1) { src1_rank = ggml_n_dims(src1); @@ -5581,6 +5584,13 @@ static bool ggmlhexagon_can_handle_op_through_cdsp(ggml_backend_dev_t dev, const switch (op_tensor->op) { case GGML_OP_ADD: { + //TODO:workaround approach to fix HWACCEL_CDSP can't works in ASR inference and LLM inference + // with some LLM models in a standard Android APP + // one more thing, I think the latest QNN SDK's internal also use the similar approach + if (ne00 < 1024) { + return false; + } + if (!ggml_are_same_shape(src0, src1)) { return false; } From 28565d1d8b189e6c9805d7fba103ab7322725041 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Wed, 30 Apr 2025 13:48:01 +0800 Subject: [PATCH 198/200] Enhance ggml_backend_reg_layla to support Hexagon backend and update CMake configuration for Hexagon SDK integration --- ggml/include/ggml-backend.h | 2 +- ggml/src/ggml-backend-reg.cpp | 8 +++- ggml/src/ggml-hexagon/CMakeLists.txt | 58 ++++++++++++++------------ ggml/src/ggml-hexagon/ggml-hexagon.cpp | 39 +++++++++++++---- ggml/src/ggml-hexagon/kernels/stub.c | 12 ++++-- 5 files changed, 
78 insertions(+), 41 deletions(-) diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h index 9002406d96e72..1f88479872e61 100644 --- a/ggml/include/ggml-backend.h +++ b/ggml/include/ggml-backend.h @@ -202,7 +202,7 @@ extern "C" { // // Backend registry // - GGML_API void ggml_backend_reg_layla(bool useVulkan, bool useOpenCL); + GGML_API void ggml_backend_reg_layla(bool useVulkan, bool useOpenCL, bool useHexagon); GGML_API void ggml_backend_device_register(ggml_backend_dev_t device); diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index bc9161ad80306..d356ab3d74853 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -161,6 +161,7 @@ struct ggml_backend_reg_entry { static bool laylaUseVulkan = false; static bool laylaUseOpenCL = false; +static bool laylaUseHexagon = false; struct ggml_backend_registry { std::vector backends; @@ -199,7 +200,9 @@ struct ggml_backend_registry { register_backend(ggml_backend_kompute_reg()); #endif #ifdef GGML_USE_HEXAGON - register_backend(ggml_backend_hexagon_reg()); + if(laylaUseHexagon) { + register_backend(ggml_backend_hexagon_reg()); + } #endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); @@ -310,9 +313,10 @@ struct ggml_backend_registry { } }; -void ggml_backend_reg_layla(bool useVulkan, bool useOpenCL) { +void ggml_backend_reg_layla(bool useVulkan, bool useOpenCL, bool useHexagon) { laylaUseVulkan = useVulkan; laylaUseOpenCL = useOpenCL; + laylaUseHexagon = useHexagon; } static ggml_backend_registry & get_reg() { diff --git a/ggml/src/ggml-hexagon/CMakeLists.txt b/ggml/src/ggml-hexagon/CMakeLists.txt index 9ce8199821c30..3353580833337 100644 --- a/ggml/src/ggml-hexagon/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/CMakeLists.txt @@ -33,42 +33,45 @@ endif() #v73 --- Snapdragon 8 Gen2 #v75 --- Snapdragon 8 Gen3 #v79 --- Snapdragon 8 Elite(aka Gen4) -if(NOT DEFINED HTP_ARCH_VERSION) - message(FATAL_ERROR "HTP_ARCH_VERSION not defined, valid htp arch: v68,v69,v73,v75,v79") -endif() +# we do not use HTP_ARCH_VERSION right now because we don't use raw cdsp calls +#if(NOT DEFINED HTP_ARCH_VERSION) +# message(FATAL_ERROR "HTP_ARCH_VERSION not defined, valid htp arch: v68,v69,v73,v75,v79") +#endif() #check whether user's specified htp arch is valid -set(CHECK_HTP_ARCH "WRONG") -foreach (feat v68 v69 v73 v75 v79) - if (${feat} STREQUAL ${HTP_ARCH_VERSION}) - set(CHECK_HTP_ARCH "GOOD") - endif() -endforeach() -if (${CHECK_HTP_ARCH} STREQUAL "WRONG") - message(FATAL_ERROR "ggml-hexagon backend only support htp arch v68,v69,v73,v75,v79") -endif() +#set(CHECK_HTP_ARCH "WRONG") +#foreach (feat v68 v69 v73 v75 v79) +# if (${feat} STREQUAL ${HTP_ARCH_VERSION}) +# set(CHECK_HTP_ARCH "GOOD") +# endif() +#endforeach() +#if (${CHECK_HTP_ARCH} STREQUAL "WRONG") +# message(FATAL_ERROR "ggml-hexagon backend only support htp arch v68,v69,v73,v75,v79") +#endif() #check optimization flags set(OPT_FLAG " ") -if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") +#if (${HTP_ARCH_VERSION} STREQUAL "v75" OR ${HTP_ARCH_VERSION} STREQUAL "v79") #works fine on Snapdragon 8Gen3&8Elite with 1.5x - 3x performance gains with the default ggml backend - set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only") -endif() -message("OPT_FLAG:${OPT_FLAG}") +# set(OPT_FLAG " -O3 -march=armv8.7-a -mcpu=cortex-x1 -mtune=cortex-x1 -flto -D_GNU_SOURCE -fvectorize -ffp-model=fast -fno-finite-math-only") +#endif() 
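Because the backend registry is constructed once inside get_reg(), the new useHexagon flag only takes effect if it is set before anything touches the registry. A hedged usage sketch from the embedding application's side, using the updated declaration from ggml-backend.h:

// Must run before the first ggml_backend_* call that constructs the registry,
// otherwise GGML_USE_HEXAGON builds will silently skip hexagon registration.
ggml_backend_reg_layla(/*useVulkan*/ false, /*useOpenCL*/ false, /*useHexagon*/ true);

// From here on, normal initialization sees the hexagon backend (when compiled
// with GGML_USE_HEXAGON) exactly as if it had been registered unconditionally.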
+#message("OPT_FLAG:${OPT_FLAG}") if(CMAKE_SYSTEM_NAME STREQUAL "Android") find_library(LOG_LIB log) - add_library(cdsprpc - SHARED - IMPORTED) - set_target_properties(cdsprpc - PROPERTIES - IMPORTED_LOCATION - ${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so) + # we do not use libcdsprpc.so provided in the Hexagon SDK, we will look for the one installed by the user's phone vendor + #add_library(cdsprpc + # SHARED + # IMPORTED) + #set_target_properties(cdsprpc + # PROPERTIES + # IMPORTED_LOCATION + # ${HEXAGON_SDK_PATH}/ipc/fastrpc/remote/ship/android_aarch64/libcdsprpc.so) - set(QNN_LINK_LIBRARIES ${LOG_LIB} cdsprpc) - set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") + #set(QNN_LINK_LIBRARIES ${LOG_LIB} cdsprpc) + set(QNN_LINK_LIBRARIES ${LOG_LIB}) + #set(QNN_DEFAULT_LIB_SEARCH_PATH "/data/local/tmp/" CACHE STRING "customized library search path for QNN backend") include_directories(${HEXAGON_SDK_PATH}/incs) include_directories(${HEXAGON_SDK_PATH}/incs/stddef) @@ -129,5 +132,6 @@ function(ggml_hexagon_setup_cfg KNAME) ) endfunction() -ggml_hexagon_build_kernel("cdsp") -ggml_hexagon_setup_cfg("ggml-hexagon.cfg") +# we do not build cdsp kernels directly in CMake +#ggml_hexagon_build_kernel("cdsp") +#ggml_hexagon_setup_cfg("ggml-hexagon.cfg") diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c088d8d68f89d..dd2d4f1b628ce 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -391,7 +391,7 @@ static struct hexagon_appcfg_t g_hexagon_appcfg = { #if defined(STANDARD_ANDROID_APP) .runtime_libpath = "/data/data/com.kantvai.kantvplayer/", #else - .runtime_libpath = "/data/local/tmp/", + .runtime_libpath = "/data/data/com.layla/files/app-data/qnn-inference/", #endif #elif defined(__linux__) .qnn_runtimelib_path = "/tmp/", @@ -1829,24 +1829,31 @@ static void ggmlhexagon_set_runtime_path(size_t device, const std::string & path if ((HEXAGON_BACKEND_QNNNPU == device) || (HWACCEL_CDSP == g_hexagon_appcfg.hwaccel_approach)) { std::string lib_runtime_path = path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images"; if (0 == setenv("LD_LIBRARY_PATH", lib_runtime_path.c_str(), 1)) { - GGMLHEXAGON_LOG_DEBUG("setenv LD_LIBRARY_PATH %s successfully", lib_runtime_path.c_str()); + GGMLHEXAGON_LOG_INFO("setenv LD_LIBRARY_PATH %s successfully", lib_runtime_path.c_str()); } else { GGMLHEXAGON_LOG_ERROR("setenv LD_LIBRARY_PATH %s failure", lib_runtime_path.c_str()); } std::string adsp_runtime_path = path + ";/vendor/dsp/cdsp;/vendor/lib/rfsa/adsp;/system/lib/rfsa/adsp;/vendor/dsp/dsp;/vendor/dsp/images;/dsp"; if (0 == setenv("ADSP_LIBRARY_PATH", adsp_runtime_path.c_str(), 1)) { - GGMLHEXAGON_LOG_DEBUG("setenv ADSP_LIBRARY_PATH %s successfully", adsp_runtime_path.c_str()); + GGMLHEXAGON_LOG_INFO("setenv ADSP_LIBRARY_PATH %s successfully", adsp_runtime_path.c_str()); } else { GGMLHEXAGON_LOG_ERROR("setenv ADSP_LIBRARY_PATH %s failure", adsp_runtime_path.c_str()); } + + std::string dsp_runtime_path = path; + if (0 == setenv("DSP_LIBRARY_PATH", dsp_runtime_path.c_str(), 1)) { + GGMLHEXAGON_LOG_INFO("setenv DSP_LIBRARY_PATH %s successfully", dsp_runtime_path.c_str()); + } else { + GGMLHEXAGON_LOG_ERROR("setenv DSP_LIBRARY_PATH %s failure", dsp_runtime_path.c_str()); + } } else { if (0 == setenv("LD_LIBRARY_PATH", (path + ":/vendor/dsp/cdsp:/vendor/lib64:/vendor/dsp/dsp:/vendor/dsp/images").c_str(), 1)) { 
GGMLHEXAGON_LOG_DEBUG("%s backend setenv successfully\n", - ggml_backend_hexagon_get_devname(device)); + ggml_backend_hexagon_get_devname(device)); } else { GGMLHEXAGON_LOG_ERROR("%s backend setenv failure\n", ggml_backend_hexagon_get_devname(device)); @@ -3375,10 +3382,10 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { #if defined(__ANDROID__) || defined(__linux__) std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtime_libpath) + "libcdsprpc.so"); - full_path /= std::filesystem::path("libcdsprpc.so").filename(); + //full_path /= std::filesystem::path("libcdsprpc.so").filename(); _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); if (nullptr == _rpc_lib_handle) { - GGMLHEXAGON_LOG_WARN("failed to load %s\n", full_path.c_str()); + GGMLHEXAGON_LOG_WARN("failed to load %s from local file, trying to find in system libraries\n", full_path.c_str()); _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); } #else @@ -5185,6 +5192,8 @@ static int ggmlhexagon_request_status_notifications(int domain_id, void * contex } static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { + throw std::runtime_error("Not implemented. Directly initialising RPC memory pool is not supported right now."); + size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; size_t probe_slots[] = {1024, 1536, 2000, 2048}; @@ -5232,6 +5241,8 @@ static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { } static void ggmlhexagon_deinit_rpcmempool(ggml_backend_hexagon_context * ctx) { + throw std::runtime_error("Not implemented. Directly initialising RPC memory pool is not supported right now."); + if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { if (ctx->rpc_mempool) { //deregister rpc memory pool @@ -6233,7 +6244,11 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device //cover following special case: // toggle backend and forth between cDSP and ggml in a standard Android APP or in // a same running process - g_hexagon_appcfg.hexagon_backend = device_index; + + // TODO: not sure why we need to update the global setting here in the original code + // it seems this code is reached when we allocate buffers for all devices (including the qnn-cpu device) + // so if it reaches this code, then it won't use the NPU anymore since the backend config will be updated to use the cpu device + // g_hexagon_appcfg.hexagon_backend = device_index; } static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_types[GGML_HEXAGON_MAX_DEVICES]; @@ -6284,6 +6299,10 @@ static const char * ggml_backend_hexagon_host_buffer_name(ggml_backend_buffer_t } static void ggml_backend_hexagon_host_buffer_free(ggml_backend_buffer_t buffer) { + // always use ggml memory management for now + ggml_aligned_free(buffer->context, 0); + return; + if (0 == g_hexagon_appcfg.enable_pinned_memory) { ggml_aligned_free(buffer->context, 0); } else { @@ -6292,6 +6311,9 @@ static void ggml_backend_hexagon_host_buffer_free(ggml_backend_buffer_t buffer) } static void * ggml_hexagon_host_malloc(ggml_backend_buffer_type_t buft, size_t size) { + // we always use ggml malloc right now + return ggml_aligned_malloc(size); + if (0 == g_hexagon_appcfg.enable_pinned_memory) { return ggml_aligned_malloc(size); } else { @@ -6664,6 +6686,9 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_lib ggmlhexagon_set_runtime_path(device, runtime_libpath); } + // the 
condition above never be true because our hardcoded runtime_libpath is always the same as the config, so we manually set the library paths here + ggmlhexagon_set_runtime_path(g_hexagon_appcfg.hexagon_backend, g_hexagon_appcfg.runtime_libpath); + if (nullptr != g_hexagon_mgr[device].backend) { GGMLHEXAGON_LOG_DEBUG("backend %d(%s) already loaded", device, ggml_backend_hexagon_get_devname(device)); diff --git a/ggml/src/ggml-hexagon/kernels/stub.c b/ggml/src/ggml-hexagon/kernels/stub.c index 6074d243610df..58cfd1d00eea6 100644 --- a/ggml/src/ggml-hexagon/kernels/stub.c +++ b/ggml/src/ggml-hexagon/kernels/stub.c @@ -291,10 +291,12 @@ __QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[ extern "C" { #endif __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_STUB_ATTRIBUTE { - return __QAIC_REMOTE(remote_handle64_open)(uri, h); + return -1; // don't support direct dsp calls yet + //return __QAIC_REMOTE(remote_handle64_open)(uri, h); } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { - return __QAIC_REMOTE(remote_handle64_close)(h); + return -1; // don't support direct dsp calls yet + //return __QAIC_REMOTE(remote_handle64_close)(h); } static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1], uint32_t _in3[1]) { remote_arg _pra[1] = {0}; @@ -306,7 +308,8 @@ static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_ _COPY(_primIn, 4, _in1, 0, 4); _COPY(_primIn, 8, _in2, 0, 4); _COPY(_primIn, 12,_in3, 0, 4); - _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra)); + // TODO: we don't support direct dsp calls yet + //_TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra)); _CATCH_FARF(_nErr) { _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _mid, __func__); } @@ -432,7 +435,8 @@ static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintp _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); - _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); + // TODO: we don't support direct dsp calls yet + //_TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), 
(uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); _QAIC_CATCH(_nErr) {} _CATCH_FARF(_nErr) { From 1e9db915ea4539675e7193f0d59a26e4092cffb0 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Sat, 17 May 2025 00:15:41 +0800 Subject: [PATCH 199/200] Refactor memory allocation and method stubs in ggml-hexagon - Improved code readability by aligning struct members and function parameters. - Enhanced error handling in _stub_method and related functions. - Updated memory allocation logic in _allocator_alloc to ensure proper alignment. - Removed commented-out code and unnecessary error messages for clarity. - Ensured consistent formatting across the file for better maintainability. --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 197 ++++++---- ggml/src/ggml-hexagon/kernels/Makefile | 4 +- ggml/src/ggml-hexagon/kernels/stub.c | 482 ++++++++++++------------- 3 files changed, 363 insertions(+), 320 deletions(-) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index dd2d4f1b628ce..c4da5d0caec4c 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -172,6 +172,8 @@ struct ggml_backend_hexagon_context; #pragma weak remote_system_request #endif +#define MAX_DOMAIN_NAMELEN 12 + #define CHECK_QNN_API(error, result) \ do { \ error = (result); \ @@ -201,6 +203,14 @@ using pfn_rpc_mem_deinit = void (*)(void); using pfn_rpc_mem_alloc = void *(*)(int, uint32_t, int); using pfn_rpc_mem_free = void (*)(void *); using pfn_rpc_mem_to_fd = int (*)(void *); +using pfn_rpc_remote_handle_control = int (*)(uint32_t, void*, uint32_t); +using pfn_rpc_remote_register_buf = int (*)(void*, int, int); +using pfn_rpc_remote_session_control = int (*)(uint32_t, void *, uint32_t); +using pfn_rpc_remote_handle64_open = int (*)(const char*, remote_handle64 *); +using pfn_rpc_remote_handle64_close = int (*)(remote_handle64); +using pfn_rpc_remote_handle64_invoke = int (*)(remote_handle64, uint32_t, remote_arg *); +using pfn_rpc_remote_handle64_control = int (*)(remote_handle64, uint32_t, void*, uint32_t); + using _pfn_QnnSaver_initialize = decltype(QnnSaver_initialize); using _pfn_QnnInterface_getProviders = decltype(QnnInterface_getProviders); using _pfn_QnnSystemInterface_getProviders = decltype(QnnSystemInterface_getProviders); @@ -818,6 +828,21 @@ static_assert(std::size(ggmlhexagon_k_op_caps) == (static_cast(GGML_OP_C static int32_t g_qnntensor_idx = 0; //ensure every QNN tensor name is unique static int32_t g_qnnopcfg_idx = 0; //ensure every QNN opconfig name is unique +// libcdsprpc.so function handles +void * _rpc_lib_handle = nullptr; +static pfn_rpc_mem_alloc _pfn_rpc_mem_alloc = nullptr; +static pfn_rpc_mem_free _pfn_rpc_mem_free = nullptr; +static pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd = nullptr; +static pfn_rpc_mem_init _pfn_rpc_mem_init = nullptr; +static pfn_rpc_mem_deinit _pfn_rpc_mem_deinit = nullptr; +static pfn_rpc_remote_handle_control _pfn_rpc_remote_handle_control = nullptr; +static pfn_rpc_remote_register_buf _pfn_rpc_remote_register_buf = nullptr; +static pfn_rpc_remote_session_control _pfn_rpc_remote_session_control = nullptr; +static pfn_rpc_remote_handle64_open _pfn_rpc_remote_handle64_open = nullptr; +static pfn_rpc_remote_handle64_close _pfn_rpc_remote_handle64_close = nullptr; +static pfn_rpc_remote_handle64_invoke 
_pfn_rpc_remote_handle64_invoke = nullptr; +static pfn_rpc_remote_handle64_control _pfn_rpc_remote_handle64_control = nullptr; + // ================================================================================================= // section-2: ggml-hexagon internal troubleshooting and profiler function/class // ================================================================================================= @@ -2743,11 +2768,8 @@ class qnn_instance { std::unordered_map _qnn_rpc_buffer_to_handles; std::atomic_bool _rpcmem_initialized{false}; - pfn_rpc_mem_alloc _pfn_rpc_mem_alloc; - pfn_rpc_mem_free _pfn_rpc_mem_free; - pfn_rpc_mem_to_fd _pfn_rpc_mem_to_fd; - pfn_rpc_mem_init _pfn_rpc_mem_init; - pfn_rpc_mem_deinit _pfn_rpc_mem_deinit; + +private: std::unordered_map _rpcmem_store_map; std::unordered_map _rpcmem_usage_map; size_t _rpcmem_usage = 0; // mempool usage in bytes @@ -2755,7 +2777,6 @@ class qnn_instance { std::string _graph_name; HEXAGONBackend _device_id; - void * _rpc_lib_handle = nullptr; bool _enable_qnn_rpc = false; //TODO:unknown issue with QNN RPC feature qnn_instance(const qnn_instance &) = delete; @@ -3380,38 +3401,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) { } } -#if defined(__ANDROID__) || defined(__linux__) - std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtime_libpath) + "libcdsprpc.so"); - //full_path /= std::filesystem::path("libcdsprpc.so").filename(); - _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); - if (nullptr == _rpc_lib_handle) { - GGMLHEXAGON_LOG_WARN("failed to load %s from local file, trying to find in system libraries\n", full_path.c_str()); - _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); - } -#else - _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); -#endif - if (nullptr == _rpc_lib_handle) { - GGMLHEXAGON_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); - return 7; - } else { - GGMLHEXAGON_LOG_DEBUG("load rpcmem lib successfully\n"); - set_rpcmem_initialized(true); - } - _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); - _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); - _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); - _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); - _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); - if (nullptr == _pfn_rpc_mem_alloc || nullptr == _pfn_rpc_mem_free || nullptr == _pfn_rpc_mem_to_fd) { - GGMLHEXAGON_LOG_WARN("unable to access symbols in QNN RPC lib, dlerror(): %s", dlerror()); - dlclose(_rpc_lib_handle); - return 8; - } - - if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy - _pfn_rpc_mem_init(); - std::vector temp_context_config; _qnn_interface.qnn_context_create(_qnn_backend_handle, _qnn_device_handle, temp_context_config.empty() ? 
nullptr : temp_context_config.data(), @@ -4765,7 +4754,7 @@ static bool ggmlhexagon_is_valid_domain_id(int domain_id, int compute_only) { return false; } -static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_domains, fastrpc_domain ** domains_info) { +/*static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_domains, fastrpc_domain ** domains_info) { int hexagon_err = AEE_SUCCESS; int ss_info = 0; void * buffer = nullptr; @@ -4824,7 +4813,7 @@ static int ggmlhexagon_get_domains_info(const char * domain_type, int * num_doma free(req.sys.domains); } return hexagon_err; -} +}*/ static int ggmlhexagon_get_dsp_support(int * domain) { int hexagon_error = AEE_SUCCESS; @@ -4872,7 +4861,7 @@ static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capab goto bail; } - if (remote_handle_control) { + if (_pfn_rpc_remote_handle_control) { if (domain == HEXAGON_ADSP || domain == HEXAGON_CDSP) { /* * query the DSP for VTCM information @@ -4882,7 +4871,7 @@ static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capab dsp_capability_vtcm_dsp.domain = (uint32_t)domain; dsp_capability_vtcm_dsp.attribute_ID = attr; dsp_capability_vtcm_dsp.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability)); + hexagon_error = _pfn_rpc_remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_vtcm_dsp, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); GGMLHEXAGON_LOG_DEBUG("running the use case without checking the capability"); @@ -4910,9 +4899,9 @@ static int ggmlhexagon_get_vtcm_info(int domain, uint32_t attr, uint32_t * capab static bool ggmlhexagon_is_unsignedpd_supported(int domain_id) { int hexagon_error = AEE_SUCCESS; - if (remote_handle_control) { + if (_pfn_rpc_remote_handle_control) { struct remote_dsp_capability dsp_capability_domain = {static_cast(domain_id), UNSIGNED_PD_SUPPORT, 0}; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); + hexagon_error = _pfn_rpc_remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_domain, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device. 
Falling back to signed pd"); return false; @@ -4941,7 +4930,7 @@ static bool ggmlhexagon_get_unsignedpd_support(void) { static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { int hexagon_error = AEE_SUCCESS; - if (remote_handle_control) { + if (_pfn_rpc_remote_handle_control) { if (domain == HEXAGON_CDSP) { /* * Query the DSP for ASYNC_FASTRPC_SUPPORT information @@ -4951,7 +4940,7 @@ static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { dsp_capability_async_support.domain = (uint32_t)domain; dsp_capability_async_support.attribute_ID = ASYNC_FASTRPC_SUPPORT; dsp_capability_async_support.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability)); + hexagon_error = _pfn_rpc_remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_async_support, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device"); hexagon_error = AEE_SUCCESS; @@ -4981,7 +4970,7 @@ static bool ggmlhexagon_is_async_fastrpc_supported(int domain) { static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int latency) { int hexagon_error = AEE_SUCCESS; - if (remote_handle_control) { + if (_pfn_rpc_remote_handle64_control) { struct remote_rpc_control_latency data; /* qos | latency @@ -4991,7 +4980,7 @@ static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int lat */ data.enable = qos; data.latency = latency; - hexagon_error = remote_handle64_control(handle, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); + hexagon_error = _pfn_rpc_remote_handle64_control(handle, DSPRPC_CONTROL_LATENCY, (void*)&data, sizeof(data)); if (hexagon_error != AEE_SUCCESS) { GGMLHEXAGON_LOG_WARN("failed with error 0x%x", hexagon_error); goto bail; @@ -5010,7 +4999,7 @@ static void ggmlhexagon_set_rpc_latency(remote_handle64 handle, int qos, int lat static bool ggmlhexagon_is_status_notification_supported(int domain) { int hexagon_error = AEE_SUCCESS; - if (remote_handle_control) { + if (_pfn_rpc_remote_handle_control) { /* * Query the DSP for STATUS_NOTIFICATION_SUPPORT information * DSP User PD status notification Support @@ -5019,7 +5008,7 @@ static bool ggmlhexagon_is_status_notification_supported(int domain) { dsp_capability_status_notification_support.domain = (uint32_t)domain; dsp_capability_status_notification_support.attribute_ID = STATUS_NOTIFICATION_SUPPORT; dsp_capability_status_notification_support.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); + hexagon_error = _pfn_rpc_remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_status_notification_support, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLHEXAGON_LOG_WARN("FastRPC Capability API is not supported on this device"); hexagon_error = AEE_SUCCESS; @@ -5051,7 +5040,7 @@ static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t goto bail; } - if (remote_handle_control) { + if (_pfn_rpc_remote_handle_control) { if (domain == HEXAGON_CDSP) { /* * Query the DSP for HMX SUPPORT information @@ -5061,7 +5050,7 @@ static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t dsp_capability_hmx_dsp.domain = (uint32_t)domain; dsp_capability_hmx_dsp.attribute_ID = attr; 
dsp_capability_hmx_dsp.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); + hexagon_error = _pfn_rpc_remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hmx_dsp, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); hexagon_error = AEE_SUCCESS; @@ -5090,7 +5079,7 @@ static int ggmlhexagon_get_hmx_support_info(int domain, uint32_t attr, uint32_t static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) { int hexagon_error = AEE_SUCCESS; *capability = 0; - if(remote_handle_control) { + if(_pfn_rpc_remote_handle_control) { /* * Query the Hexagon processor architecture version information */ @@ -5098,7 +5087,7 @@ static int ggmlhexagon_get_hvx_arch_ver(int domain, uint32_t * capability) { dsp_capability_arch_ver.domain = (uint32_t)domain; dsp_capability_arch_ver.attribute_ID = ARCH_VER; dsp_capability_arch_ver.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_arch_ver, sizeof(struct remote_dsp_capability)); + hexagon_error = _pfn_rpc_remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_arch_ver, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF) == (AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); hexagon_error = AEE_SUCCESS; @@ -5134,7 +5123,7 @@ static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t goto bail; } - if (remote_handle_control) { + if (_pfn_rpc_remote_handle_control) { if (domain == HEXAGON_CDSP) { /* * Query the DSP for HVX SUPPORT information @@ -5144,7 +5133,7 @@ static int ggmlhexagon_get_hvx_support_info(int domain, uint32_t attr, uint32_t dsp_capability_hvx_dsp.domain = (uint32_t)domain; dsp_capability_hvx_dsp.attribute_ID = attr; dsp_capability_hvx_dsp.capability = (uint32_t)0; - hexagon_error = remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); + hexagon_error = _pfn_rpc_remote_handle_control(DSPRPC_GET_DSP_INFO, &dsp_capability_hvx_dsp, sizeof(struct remote_dsp_capability)); if ((hexagon_error & 0xFF)==(AEE_EUNSUPPORTEDAPI & 0xFF)) { GGMLHEXAGON_LOG_DEBUG("FastRPC Capability API is not supported on this device"); hexagon_error = AEE_SUCCESS; @@ -5180,7 +5169,7 @@ static int ggmlhexagon_request_status_notifications(int domain_id, void * contex status_notification_support = ggmlhexagon_is_status_notification_supported(domain_id); if (status_notification_support) { - hexagon_error = remote_session_control(FASTRPC_REGISTER_STATUS_NOTIFICATIONS, (void*)¬if, sizeof(notif)); + hexagon_error = _pfn_rpc_remote_session_control(FASTRPC_REGISTER_STATUS_NOTIFICATIONS, (void*)¬if, sizeof(notif)); if (hexagon_error != AEE_SUCCESS) { GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed to enable status notifications", hexagon_error); } @@ -5192,8 +5181,6 @@ static int ggmlhexagon_request_status_notifications(int domain_id, void * contex } static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { - throw std::runtime_error("Not implemented. 
Directly initialising RPC memory pool is not supported right now."); - size_t candidate_size = 0; uint8_t * rpc_buffer = nullptr; size_t probe_slots[] = {1024, 1536, 2000, 2048}; @@ -5203,13 +5190,13 @@ static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { return 1; for (size_t idx = 0; idx < probe_counts; idx++) { - rpc_buffer = static_cast(rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); + rpc_buffer = static_cast(_pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS, (probe_slots[idx] * SIZE_IN_MB))); if (nullptr == rpc_buffer) { GGMLHEXAGON_LOG_DEBUG("alloc rpcmem %d (MiB) failure during probe rpc memory info, reason: %s\n", probe_slots[idx], strerror(errno)); break; } else { candidate_size = probe_slots[idx]; - rpcmem_free(rpc_buffer); + _pfn_rpc_mem_free(rpc_buffer); rpc_buffer = nullptr; } } @@ -5223,7 +5210,7 @@ static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { ctx->rpc_mempool_len = ctx->rpc_mempool_capacity - (8 * SIZE_IN_MB); //FIXME: it seems there is unknown issue with 2+ GiB memory pool - ctx->rpc_mempool = rpcmem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len); + ctx->rpc_mempool = _pfn_rpc_mem_alloc(RPCMEM_HEAP_ID_SYSTEM, RPCMEM_DEFAULT_FLAGS | RPCMEM_TRY_MAP_STATIC, ctx->rpc_mempool_len); if (nullptr == ctx->rpc_mempool) { GGMLHEXAGON_LOG_WARN("alloc rpc memorypool %d failed", ctx->rpc_mempool_len); return 2; @@ -5232,23 +5219,21 @@ static int ggmlhexagon_init_rpcmempool(ggml_backend_hexagon_context * ctx) { ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_len / SIZE_IN_MB); } - ctx->rpc_mempool_handle = rpcmem_to_fd(ctx->rpc_mempool); + ctx->rpc_mempool_handle = _pfn_rpc_mem_to_fd(ctx->rpc_mempool); GGMLHEXAGON_LOG_DEBUG("rpc mempool handle %d", ctx->rpc_mempool_handle); - remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_handle); + _pfn_rpc_remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, ctx->rpc_mempool_handle); } return 0; } static void ggmlhexagon_deinit_rpcmempool(ggml_backend_hexagon_context * ctx) { - throw std::runtime_error("Not implemented. 
Directly initialising RPC memory pool is not supported right now."); - if ((g_hexagon_appcfg.hwaccel_approach == HWACCEL_CDSP) && (1 == g_hexagon_appcfg.enable_rpc_ion_mempool)) { if (ctx->rpc_mempool) { //deregister rpc memory pool - remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, -1); + _pfn_rpc_remote_register_buf(ctx->rpc_mempool, ctx->rpc_mempool_len, -1); GGMLHEXAGON_LOG_DEBUG("free rpc mempool %p", ctx->rpc_mempool); - rpcmem_free(ctx->rpc_mempool); + _pfn_rpc_mem_free(ctx->rpc_mempool); ctx->rpc_mempool = nullptr; ctx->rpc_mempool_len = 0; ctx->rpc_mempool_capacity = 0; @@ -5317,6 +5302,49 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { static std::mutex mutex; std::lock_guard lock(mutex); +#if defined(__ANDROID__) || defined(__linux__) + std::filesystem::path full_path(std::string(g_hexagon_appcfg.runtime_libpath) + "libcdsprpc.so"); + //full_path /= std::filesystem::path("libcdsprpc.so").filename(); + _rpc_lib_handle = dlopen(full_path.string().c_str(), RTLD_NOW | RTLD_LOCAL); + if (nullptr == _rpc_lib_handle) { + GGMLHEXAGON_LOG_WARN("failed to load %s from local file, trying to find in system libraries\n", full_path.c_str()); + _rpc_lib_handle = dlopen("libcdsprpc.so", RTLD_NOW | RTLD_LOCAL); + } +#else + _rpc_lib_handle = dlopen("libcdsprpc.dll", RTLD_NOW | RTLD_LOCAL); +#endif + + if (nullptr == _rpc_lib_handle) { + GGMLHEXAGON_LOG_WARN("failed to load qualcomm's rpc lib, error:%s\n", dlerror()); + return 7; + } else { + GGMLHEXAGON_LOG_DEBUG("load rpcmem lib successfully\n"); + } + _pfn_rpc_mem_init = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_init")); + _pfn_rpc_mem_deinit = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_deinit")); + _pfn_rpc_mem_alloc = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_alloc")); + _pfn_rpc_mem_free = reinterpret_cast(dlsym(_rpc_lib_handle, "rpcmem_free")); + _pfn_rpc_mem_to_fd = reinterpret_cast(dlsym(_rpc_lib_handle,"rpcmem_to_fd")); + _pfn_rpc_remote_handle_control = reinterpret_cast(dlsym(_rpc_lib_handle,"remote_handle_control")); + _pfn_rpc_remote_register_buf = reinterpret_cast(dlsym(_rpc_lib_handle,"remote_register_buf")); + _pfn_rpc_remote_session_control = reinterpret_cast(dlsym(_rpc_lib_handle,"remote_session_control")); + _pfn_rpc_remote_handle64_open = reinterpret_cast(dlsym(_rpc_lib_handle,"remote_handle64_open")); + _pfn_rpc_remote_handle64_close = reinterpret_cast(dlsym(_rpc_lib_handle,"remote_handle64_close")); + _pfn_rpc_remote_handle64_invoke = reinterpret_cast(dlsym(_rpc_lib_handle,"remote_handle64_invoke")); + _pfn_rpc_remote_handle64_control = reinterpret_cast(dlsym(_rpc_lib_handle,"remote_handle64_control")); + + if (nullptr == _pfn_rpc_mem_alloc || + nullptr == _pfn_rpc_mem_free || + nullptr == _pfn_rpc_mem_to_fd || + nullptr == _pfn_rpc_remote_register_buf) { + GGMLHEXAGON_LOG_WARN("unable to access symbols in QNN RPC lib, dlerror(): %s", dlerror()); + dlclose(_rpc_lib_handle); + return 8; + } + + if (nullptr != _pfn_rpc_mem_init) // make Qualcomm's SoC based low-end phone happy + _pfn_rpc_mem_init(); + int hexagon_error = AEE_SUCCESS; int domain_id = HEXAGON_CDSP; @@ -5326,7 +5354,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { bool is_unsignedpd_enabled = false; int use_logical_id = 0; int core_id = -1; - fastrpc_domain * domains_info = NULL; + //fastrpc_domain * domains_info = NULL; int num_domains = -1; domain * my_domain = NULL; @@ -5344,7 +5372,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { } ctx->ggmlop_handle = 0; - 
if (-1 == domain_id) { + /*if (-1 == domain_id) { if (nullptr != domain_type) { if ((strcmp(domain_type, "NSP") != 0 && strcmp(domain_type, "HPASS") != 0)) { GGMLHEXAGON_LOG_WARN("invalid domain_type %s. possible values are NSP or HPASS", domain_type); @@ -5381,7 +5409,7 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_DEBUG("error: 0x%x, defaulting to cDSP domain", hexagon_error); } } - } + }*/ if (0 == use_logical_id) { if (!ggmlhexagon_is_valid_domain_id(domain_id, 0)) { @@ -5411,11 +5439,11 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_INFO("using Hexagon domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); GGMLHEXAGON_LOG_INFO("unsignedpd_enabled %d", is_unsignedpd_enabled); if (is_unsignedpd_enabled) { - if (remote_session_control) { + if (_pfn_rpc_remote_session_control) { struct remote_rpc_control_unsigned_module data; data.enable = 1; data.domain = domain_id; - hexagon_error = remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); + hexagon_error = _pfn_rpc_remote_session_control(DSPRPC_CONTROL_UNSIGNED_MODULE, (void *)&data, sizeof(data)); GGMLHEXAGON_LOG_DEBUG("remote_session_control returned %d for configuring unsigned PD success", hexagon_error); if (AEE_SUCCESS != hexagon_error) { GGMLHEXAGON_LOG_DEBUG("error 0x%x: remote_session_control failed", hexagon_error); @@ -6147,7 +6175,7 @@ static void ggml_backend_hexagon_device_get_memory(ggml_backend_dev_t dev, size_ //TODO: probe GPU info in Qualcomm Adreno GPU *total = ggmlhexagon_get_system_total_memory_in_bytes(); *free = ggmlhexagon_get_system_free_memory_in_bytes(); - } else if (HEXAGON_BACKEND_QNNNPU == ctx->device) { + } else if (HEXAGON_BACKEND_QNNNPU == ctx->device || HEXAGON_BACKEND_CDSP == ctx->device) { size_t rpc_ion_memsize = 0; size_t rpc_ion_usage = 0; if (HWACCEL_CDSP != g_hexagon_appcfg.hwaccel_approach) { @@ -6248,7 +6276,7 @@ static ggml_backend_buffer_type_t ggml_backend_hexagon_buffer_type(size_t device // TODO: not sure why we need to update the global setting here in the original code // it seems this code is reached when we allocate buffers for all devices (including the qnn-cpu device) // so if it reaches this code, then it won't use the NPU anymore since the backend config will be updated to use the cpu device - // g_hexagon_appcfg.hexagon_backend = device_index; + g_hexagon_appcfg.hexagon_backend = device_index; } static struct ggml_backend_buffer_type ggml_backend_hexagon_buffer_types[GGML_HEXAGON_MAX_DEVICES]; @@ -6728,3 +6756,22 @@ ggml_backend_t ggml_backend_hexagon_init(size_t device, const char * runtime_lib } GGML_BACKEND_DL_IMPL(ggml_backend_hexagon_reg) + +// ================================================================================================= +// section-9: stub of remote cdsp functions +// ================================================================================================= + +__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_open)( __QAIC_IN_CHAR const char* name, __QAIC_OUT remote_handle64 *ph) __QAIC_REMOTE_ATTRIBUTE +{ + return _pfn_rpc_remote_handle64_open(name, ph); +} + +__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_close)(__QAIC_IN remote_handle64 h) __QAIC_REMOTE_ATTRIBUTE +{ + return _pfn_rpc_remote_handle64_close(h); +} + +__QAIC_REMOTE_EXPORT __QAIC_RETURN int __QAIC_REMOTE(remote_handle64_invoke)(__QAIC_IN remote_handle64 h, __QAIC_IN uint32_t dwScalars, __QAIC_IN remote_arg *pra) 
__QAIC_REMOTE_ATTRIBUTE +{ + return _pfn_rpc_remote_handle64_invoke(h, dwScalars, pra); +} diff --git a/ggml/src/ggml-hexagon/kernels/Makefile b/ggml/src/ggml-hexagon/kernels/Makefile index 0e6b3fa2e4df6..c762f8bdd7901 100755 --- a/ggml/src/ggml-hexagon/kernels/Makefile +++ b/ggml/src/ggml-hexagon/kernels/Makefile @@ -4,8 +4,8 @@ #HEXAGON_SDK_PATH=/opt/qcom/Hexagon_SDK/6.2.0.1 HEXAGON_COMPUTE=compute${HTP_ARCH_VERSION} -HEXAGON_CC=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang -HEXAGON_CXX=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.8.06/Tools/bin/hexagon-clang +HEXAGON_CC=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.7.06/Tools/bin/hexagon-clang +HEXAGON_CXX=${HEXAGON_SDK_PATH}/tools/HEXAGON_Tools/8.7.06/Tools/bin/hexagon-clang TARGET=libggmlop-skel.so diff --git a/ggml/src/ggml-hexagon/kernels/stub.c b/ggml/src/ggml-hexagon/kernels/stub.c index 58cfd1d00eea6..a32ac180b0f37 100644 --- a/ggml/src/ggml-hexagon/kernels/stub.c +++ b/ggml/src/ggml-hexagon/kernels/stub.c @@ -14,73 +14,73 @@ typedef struct _heap _heap; struct _heap { - _heap* pPrev; - const char* loc; - uint64_t buf; + _heap* pPrev; + const char* loc; + uint64_t buf; }; typedef struct _allocator { - _heap* pheap; - uint8_t* stack; - uint8_t* stackEnd; - int nSize; + _heap* pheap; + uint8_t* stack; + uint8_t* stackEnd; + int nSize; } _allocator; _ATTRIBUTE_UNUSED static __inline int _heap_alloc(_heap** ppa, const char* loc, int size, void** ppbuf) { - _heap* pn = 0; - pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); - if(pn != 0) { - pn->pPrev = *ppa; - pn->loc = loc; - *ppa = pn; - *ppbuf = (void*)&(pn->buf); - return 0; - } else { - return -1; - } + _heap* pn = 0; + pn = MALLOC((size_t)size + sizeof(_heap) - sizeof(uint64_t)); + if(pn != 0) { + pn->pPrev = *ppa; + pn->loc = loc; + *ppa = pn; + *ppbuf = (void*)&(pn->buf); + return 0; + } else { + return -1; + } } #define _ALIGN_SIZE(x, y) (((x) + (y-1)) & ~(y-1)) _ATTRIBUTE_UNUSED static __inline int _allocator_alloc(_allocator* me, - const char* loc, - int size, - unsigned int al, - void** ppbuf) { - if(size < 0) { - return -1; - } else if (size == 0) { - *ppbuf = 0; - return 0; - } - if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < (uintptr_t)me->stack + (size_t)me->nSize) { - *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); - me->stackEnd = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; - return 0; - } else { - return _heap_alloc(&me->pheap, loc, size, ppbuf); - } + const char* loc, + int size, + unsigned int al, + void** ppbuf) { + if(size < 0) { + return -1; + } else if (size == 0) { + *ppbuf = 0; + return 0; + } + if((_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + (size_t)size) < (uintptr_t)me->stack + (size_t)me->nSize) { + *ppbuf = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al); + me->stackEnd = (uint8_t*)_ALIGN_SIZE((uintptr_t)me->stackEnd, al) + size; + return 0; + } else { + return _heap_alloc(&me->pheap, loc, size, ppbuf); + } } _ATTRIBUTE_UNUSED static __inline void _allocator_deinit(_allocator* me) { - _heap* pa = me->pheap; - while(pa != 0) { - _heap* pn = pa; - const char* loc = pn->loc; - (void)loc; - pa = pn->pPrev; - FREE(pn); - } + _heap* pa = me->pheap; + while(pa != 0) { + _heap* pn = pa; + const char* loc = pn->loc; + (void)loc; + pa = pn->pPrev; + FREE(pn); + } } _ATTRIBUTE_UNUSED static __inline void _allocator_init(_allocator* me, uint8_t* stack, int stackSize) { - me->stack = stack; - me->stackEnd = stack + stackSize; - me->nSize = stackSize; - me->pheap = 0; + me->stack = stack; + 
me->stackEnd = stack + stackSize; + me->nSize = stackSize; + me->pheap = 0; } @@ -165,15 +165,15 @@ typedef struct UnionType UnionType; typedef struct StructType StructType; typedef struct SequenceType SequenceType; struct Type { - INHERIT_TYPE; + INHERIT_TYPE; }; struct SequenceType { - const Type * seqType; - uint32_t nMaxLen; - uint32_t inSize; - uint32_t routSizePrimIn; - uint32_t routSizePrimROut; + const Type * seqType; + uint32_t nMaxLen; + uint32_t inSize; + uint32_t routSizePrimIn; + uint32_t routSizePrimROut; }; //byte offset from the start of the case values for @@ -185,49 +185,49 @@ struct SequenceType { //can be used directly to find the correct case typedef union CaseValuePtr CaseValuePtr; union CaseValuePtr { - const uint8_t* value8s; - const uint16_t* value16s; - const uint32_t* value32s; - const uint64_t* value64s; + const uint8_t* value8s; + const uint16_t* value16s; + const uint32_t* value32s; + const uint64_t* value64s; }; //these are only used in complex cases //so I pulled them out of the type definition as references to make //the type smaller struct UnionType { - const Type *descriptor; - uint32_t nCases; - const CaseValuePtr caseValues; - const Type * const *cases; - int32_t inSize; - int32_t routSizePrimIn; - int32_t routSizePrimROut; - uint8_t inAlignment; - uint8_t routAlignmentPrimIn; - uint8_t routAlignmentPrimROut; - uint8_t inCaseAlignment; - uint8_t routCaseAlignmentPrimIn; - uint8_t routCaseAlignmentPrimROut; - uint8_t nativeCaseAlignment; - uint8_t bDefaultCase; + const Type *descriptor; + uint32_t nCases; + const CaseValuePtr caseValues; + const Type * const *cases; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; + uint8_t inCaseAlignment; + uint8_t routCaseAlignmentPrimIn; + uint8_t routCaseAlignmentPrimROut; + uint8_t nativeCaseAlignment; + uint8_t bDefaultCase; }; struct StructType { - uint32_t nMembers; - const Type * const *members; - int32_t inSize; - int32_t routSizePrimIn; - int32_t routSizePrimROut; - uint8_t inAlignment; - uint8_t routAlignmentPrimIn; - uint8_t routAlignmentPrimROut; + uint32_t nMembers; + const Type * const *members; + int32_t inSize; + int32_t routSizePrimIn; + int32_t routSizePrimROut; + uint8_t inAlignment; + uint8_t routAlignmentPrimIn; + uint8_t routAlignmentPrimROut; }; typedef struct Parameter Parameter; struct Parameter { - INHERIT_TYPE; - uint8_t mode; - uint8_t bNotNil; + INHERIT_TYPE; + uint8_t mode; + uint8_t bNotNil; }; #define SLIM_IFPTR32(is32,is64) (sizeof(uintptr_t) == 4 ? 
(is32) : (is64)) @@ -235,26 +235,26 @@ struct Parameter { typedef struct Method Method; struct Method { - uint32_t uScalars; //no method index - int32_t primInSize; - int32_t primROutSize; - int maxArgs; - int numParams; - const Parameter * const *params; - uint8_t primInAlignment; - uint8_t primROutAlignment; + uint32_t uScalars; //no method index + int32_t primInSize; + int32_t primROutSize; + int maxArgs; + int numParams; + const Parameter * const *params; + uint8_t primInAlignment; + uint8_t primROutAlignment; }; typedef struct Interface Interface; struct Interface { - int nMethods; - const Method * const *methodArray; - int nIIds; - const uint32_t *iids; - const uint16_t* methodStringArray; - const uint16_t* methodStrings; - const char* strings; + int nMethods; + const Method * const *methodArray; + int nIIds; + const uint32_t *iids; + const uint16_t* methodStringArray; + const uint16_t* methodStrings; + const char* strings; }; @@ -291,177 +291,173 @@ __QAIC_SLIM_EXPORT const Interface __QAIC_SLIM(ggmlop_slim) = {8,&(methodArrays[ extern "C" { #endif __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_open)(const char* uri, remote_handle64* h) __QAIC_STUB_ATTRIBUTE { - return -1; // don't support direct dsp calls yet - //return __QAIC_REMOTE(remote_handle64_open)(uri, h); + return __QAIC_REMOTE(remote_handle64_open)(uri, h); } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_close)(remote_handle64 h) __QAIC_STUB_ATTRIBUTE { - return -1; // don't support direct dsp calls yet - //return __QAIC_REMOTE(remote_handle64_close)(h); + return __QAIC_REMOTE(remote_handle64_close)(h); } static __inline int _stub_method(remote_handle64 _handle, uint32_t _mid, uint32_t _in0[1], uint32_t _in1[1], uint32_t _in2[1], uint32_t _in3[1]) { - remote_arg _pra[1] = {0}; - uint32_t _primIn[4]= {0}; - int _nErr = 0; - _pra[0].buf.pv = (void*)_primIn; - _pra[0].buf.nLen = sizeof(_primIn); - _COPY(_primIn, 0, _in0, 0, 4); - _COPY(_primIn, 4, _in1, 0, 4); - _COPY(_primIn, 8, _in2, 0, 4); - _COPY(_primIn, 12,_in3, 0, 4); - // TODO: we don't support direct dsp calls yet - //_TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra)); - _CATCH_FARF(_nErr) { - _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _mid, __func__); - } - return _nErr; + remote_arg _pra[1] = {0}; + uint32_t _primIn[4]= {0}; + int _nErr = 0; + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 4, _in1, 0, 4); + _COPY(_primIn, 8, _in2, 0, 4); + _COPY(_primIn, 12,_in3, 0, 4); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _pra)); + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, 1, 0, 0, 0), _mid, __func__); +} + return _nErr; } __QAIC_STUB_EXPORT AEEResult __QAIC_STUB(ggmlop_dsp_setclocks)(remote_handle64 _handle, int32 power_level, int32 latency, int32 dcvs_enable, int32 threads) __QAIC_STUB_ATTRIBUTE { - uint32_t _mid = 2; - return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable, (uint32_t*)&threads); + uint32_t _mid = 2; + return _stub_method(_handle, _mid, (uint32_t*)&power_level, (uint32_t*)&latency, (uint32_t*)&dcvs_enable, (uint32_t*)&threads); } static __inline int 
_stub_unpack(_ATTRIBUTE_UNUSED remote_arg* _praROutPost, _ATTRIBUTE_UNUSED remote_arg* _ppraROutPost[1], _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { - int _nErr = 0; - remote_arg* _praROutPostStart = _praROutPost; - remote_arg** _ppraROutPostStart = _ppraROutPost; - _ppraROutPost = &_praROutPost; - _COPY(_rout0, 0, _primROut, 0, 4); - _COPY(_rout1, 0, _primROut, 4, 16); - _COPY(_rout2, 0, _primROut, 20, 16); - _COPY(_rout3, 0, _primROut, 36, 4); - _COPY(_rout4, 0, _primROut, 40, 64); - _COPY(_rout5, 0, _primROut, 104, 4); - _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; - return _nErr; + int _nErr = 0; + remote_arg* _praROutPostStart = _praROutPost; + remote_arg** _ppraROutPostStart = _ppraROutPost; + _ppraROutPost = &_praROutPost; + _COPY(_rout0, 0, _primROut, 0, 4); + _COPY(_rout1, 0, _primROut, 4, 16); + _COPY(_rout2, 0, _primROut, 20, 16); + _COPY(_rout3, 0, _primROut, 36, 4); + _COPY(_rout4, 0, _primROut, 40, 64); + _COPY(_rout5, 0, _primROut, 104, 4); + _ppraROutPostStart[0] += (_praROutPost - _praROutPostStart) +1; + return _nErr; } static __inline int _stub_pack(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { - int _nErr = 0; - remote_arg* _praInStart = _praIn; - remote_arg** _ppraInStart = _ppraIn; - remote_arg* _praROutStart = _praROut; - remote_arg** _ppraROutStart = _ppraROut; - _ppraIn = &_praIn; - _ppraROut = &_praROut; - _COPY(_primIn, 0, _rout6Len, 0, 4); - _praROut[0].buf.pv = _rout6[0]; - _praROut[0].buf.nLen = (4 * _rout6Len[0]); - _ppraInStart[0] += (_praIn - _praInStart) + 0; - _ppraROutStart[0] += (_praROut - _praROutStart) +1; - return _nErr; + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _rout6Len, 0, 4); + _praROut[0].buf.pv = _rout6[0]; + _praROut[0].buf.nLen = (4 * _rout6Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 0; + _ppraROutStart[0] += (_praROut - _praROutStart) +1; + return _nErr; } static __inline int _stub_pack_1(_ATTRIBUTE_UNUSED _allocator* _al, _ATTRIBUTE_UNUSED remote_arg* _praIn, _ATTRIBUTE_UNUSED remote_arg* _ppraIn[1], _ATTRIBUTE_UNUSED remote_arg* _praROut, _ATTRIBUTE_UNUSED remote_arg* _ppraROut[1], _ATTRIBUTE_UNUSED remote_arg* _praHIn, _ATTRIBUTE_UNUSED remote_arg* _ppraHIn[1], _ATTRIBUTE_UNUSED remote_arg* _praHROut, _ATTRIBUTE_UNUSED remote_arg* _ppraHROut[1], _ATTRIBUTE_UNUSED void* _primIn, _ATTRIBUTE_UNUSED void* _primROut, _ATTRIBUTE_UNUSED uint32_t 
_in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { - int _nErr = 0; - remote_arg* _praInStart = _praIn; - remote_arg** _ppraInStart = _ppraIn; - remote_arg* _praROutStart = _praROut; - remote_arg** _ppraROutStart = _ppraROut; - _ppraIn = &_praIn; - _ppraROut = &_praROut; - _COPY(_primIn, 0, _in0, 0, 4); - _COPY(_primIn, 4, _in1, 0, 16); - _COPY(_primIn, 20, _in2, 0, 16); - _COPY(_primIn, 36, _in3, 0, 4); - _COPY(_primIn, 40, _in4, 0, 64); - _COPY(_primIn, 104, _in5, 0, 4); - _COPY(_primIn, 108, _in6Len, 0, 4); - _praIn[0].buf.pv = (void*) _in6[0]; - _praIn[0].buf.nLen = (4 * _in6Len[0]); - _ppraInStart[0] += (_praIn - _praInStart) + 1; - _ppraROutStart[0] += (_praROut - _praROutStart) +0; - return _nErr; + int _nErr = 0; + remote_arg* _praInStart = _praIn; + remote_arg** _ppraInStart = _ppraIn; + remote_arg* _praROutStart = _praROut; + remote_arg** _ppraROutStart = _ppraROut; + _ppraIn = &_praIn; + _ppraROut = &_praROut; + _COPY(_primIn, 0, _in0, 0, 4); + _COPY(_primIn, 4, _in1, 0, 16); + _COPY(_primIn, 20, _in2, 0, 16); + _COPY(_primIn, 36, _in3, 0, 4); + _COPY(_primIn, 40, _in4, 0, 64); + _COPY(_primIn, 104, _in5, 0, 4); + _COPY(_primIn, 108, _in6Len, 0, 4); + _praIn[0].buf.pv = (void*) _in6[0]; + _praIn[0].buf.nLen = (4 * _in6Len[0]); + _ppraInStart[0] += (_praIn - _praInStart) + 1; + _ppraROutStart[0] += (_praROut - _praROutStart) +0; + return _nErr; } static __inline void _count(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _rout0[1], _ATTRIBUTE_UNUSED uint32_t _rout1[4], _ATTRIBUTE_UNUSED uint32_t _rout2[4], _ATTRIBUTE_UNUSED uint32_t _rout3[1], _ATTRIBUTE_UNUSED uint32_t _rout4[16], _ATTRIBUTE_UNUSED uint32_t _rout5[1], _ATTRIBUTE_UNUSED char* _rout6[1], _ATTRIBUTE_UNUSED uint32_t _rout6Len[1]) { - _numIn[0] += 0; - _numROut[0] += 1; - _numInH[0] += 0; - _numROutH[0] += 0; + _numIn[0] += 0; + _numROut[0] += 1; + _numInH[0] += 0; + _numROutH[0] += 0; } static __inline void _count_1(int _numIn[1], int _numROut[1], int _numInH[1], int _numROutH[1], _ATTRIBUTE_UNUSED uint32_t _in0[1], _ATTRIBUTE_UNUSED uint32_t _in1[4], _ATTRIBUTE_UNUSED uint32_t _in2[4], _ATTRIBUTE_UNUSED uint32_t _in3[1], _ATTRIBUTE_UNUSED uint32_t _in4[16], _ATTRIBUTE_UNUSED uint32_t _in5[1], _ATTRIBUTE_UNUSED char* _in6[1], _ATTRIBUTE_UNUSED uint32_t _in6Len[1]) { - _numIn[0] += 1; - _numROut[0] += 0; - _numInH[0] += 0; - _numROutH[0] += 0; + _numIn[0] += 1; + _numROut[0] += 0; + _numInH[0] += 0; + _numROutH[0] += 0; } static __inline int _stub_method_1(remote_handle64 _handle, uint32_t _mid, uintptr_t _in0[SLIM_IFPTR32(29, 16)], uintptr_t _in1[SLIM_IFPTR32(29, 16)], uintptr_t _rout2[SLIM_IFPTR32(29, 16)]) { - remote_arg* _pra = 0; - int _numIn[1] = {0}; - int _numROut[1] = {0}; - int _numInH[1] = {0}; - int _numROutH[1] = {0}; - _allocator _al[1] = {{0}}; - uint32_t _primIn[57]= {0}; - uint32_t _primROut[27]= {0}; - remote_arg* _praIn = 0; - remote_arg* _praROut = 0; - remote_arg* _praROutPost = 0; - remote_arg** _ppraROutPost = &_praROutPost; - remote_arg** _ppraIn = &_praIn; - remote_arg** _ppraROut = &_praROut; - remote_arg* _praHIn = 0; - remote_arg** _ppraHIn = &_praHIn; - remote_arg* _praHROut = 0; - remote_arg** _ppraHROut = &_praHROut; - int _nErr = 0; - _numIn[0] = 0; - _numROut[0] = 0; - _numInH[0] = 0; - _numROutH[0] = 0; - _count_1(_numIn, 
_numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30]))); - _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30]))); - _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30]))); - if(_numIn[0]>=255){ - return AEE_EUNSUPPORTED; - } - if(_numROut[0]>=255){ - return AEE_EUNSUPPORTED; - } - _allocator_init(_al, 0, 0); - _QAIC_ALLOCATE(_nErr, _al, ((((((((_numIn[0] + _numROut[0]) + _numInH[0]) + _numROutH[0]) + 1) + 1) + 0) + 0) * sizeof(_pra[0])), 4, _pra); - _QAIC_ASSERT(_nErr, _pra); - _pra[0].buf.pv = (void*)_primIn; - _pra[0].buf.nLen = sizeof(_primIn); - _pra[(_numIn[0] + 1)].buf.pv = (void*)_primROut; - _pra[(_numIn[0] + 1)].buf.nLen = sizeof(_primROut); - _praIn = (_pra + 1); - _praROut = (_praIn + _numIn[0] + 1); - _praROutPost = _praROut; - if(_praHIn == 0) - { - _praHIn = ((_praROut + _numROut[0]) + 1); - } - if(_praHROut == 0) - (_praHROut = _praHIn + _numInH[0] + 0); - _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); - _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); - _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), 
(char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); - _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); - _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); - // TODO: we don't support direct dsp calls yet - //_TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); - _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); - _QAIC_CATCH(_nErr) {} - _CATCH_FARF(_nErr) { - _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _mid, __func__); - } - _allocator_deinit(_al); - return _nErr; + remote_arg* _pra = 0; + int _numIn[1] = {0}; + int _numROut[1] = {0}; + int _numInH[1] = {0}; + int _numROutH[1] = {0}; + _allocator _al[1] = {{0}}; + uint32_t _primIn[57]= {0}; + uint32_t _primROut[27]= {0}; + remote_arg* _praIn = 0; + remote_arg* _praROut = 0; + remote_arg* _praROutPost = 0; + remote_arg** _ppraROutPost = &_praROutPost; + remote_arg** _ppraIn = &_praIn; + remote_arg** _ppraROut = &_praROut; + remote_arg* _praHIn = 0; + remote_arg** _ppraHIn = &_praHIn; + remote_arg* _praHROut = 0; + remote_arg** _ppraHROut = &_praHROut; + int _nErr = 0; + _numIn[0] = 0; + _numROut[0] = 0; + _numInH[0] = 0; + _numROutH[0] = 0; + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30]))); + _count_1(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30]))); + _count(_numIn, _numROut, _numInH, _numROutH, (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30]))); + if(_numIn[0]>=255){ + return AEE_EUNSUPPORTED; + } + if(_numROut[0]>=255){ + return AEE_EUNSUPPORTED; + } + _allocator_init(_al, 0, 0); + _QAIC_ALLOCATE(_nErr, _al, ((((((((_numIn[0] + _numROut[0]) + _numInH[0]) + _numROutH[0]) + 1) + 1) + 0) + 0) * sizeof(_pra[0])), 4, _pra); + 
_QAIC_ASSERT(_nErr, _pra); + _pra[0].buf.pv = (void*)_primIn; + _pra[0].buf.nLen = sizeof(_primIn); + _pra[(_numIn[0] + 1)].buf.pv = (void*)_primROut; + _pra[(_numIn[0] + 1)].buf.nLen = sizeof(_primROut); + _praIn = (_pra + 1); + _praROut = (_praIn + _numIn[0] + 1); + _praROutPost = _praROut; + if(_praHIn == 0) + { + _praHIn = ((_praROut + _numROut[0]) + 1); + } + if(_praHROut == 0) + (_praHROut = _praHIn + _numInH[0] + 0); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 0), 0, (uint32_t*)&(((uint32_t*)_in0)[0]), (uint32_t*)&(((uint32_t*)_in0)[1]), (uint32_t*)&(((uint32_t*)_in0)[5]), (uint32_t*)&(((uint32_t*)_in0)[9]), (uint32_t*)&(((uint32_t*)_in0)[10]), (uint32_t*)&(((uint32_t*)_in0)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in0)[27]), (char**)&(((uint64_t*)_in0)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in0)[28]), (uint32_t*)&(((uint32_t*)_in0)[30])))); + _TRY(_nErr, _stub_pack_1(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 112), 0, (uint32_t*)&(((uint32_t*)_in1)[0]), (uint32_t*)&(((uint32_t*)_in1)[1]), (uint32_t*)&(((uint32_t*)_in1)[5]), (uint32_t*)&(((uint32_t*)_in1)[9]), (uint32_t*)&(((uint32_t*)_in1)[10]), (uint32_t*)&(((uint32_t*)_in1)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_in1)[27]), (char**)&(((uint64_t*)_in1)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_in1)[28]), (uint32_t*)&(((uint32_t*)_in1)[30])))); + _TRY(_nErr, _stub_pack(_al, (_praIn + 0), _ppraIn, (_praROut + 0), _ppraROut, _praHIn, _ppraHIn, _praHROut, _ppraHROut, ((char*)_primIn + 224), ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_ASSERT(_nErr, (_numInH[0] + 0) <= 15); + _QAIC_ASSERT(_nErr, (_numROutH[0] + 0) <= 15); + _TRY_FARF(_nErr, __QAIC_REMOTE(remote_handle64_invoke)(_handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _pra)); + _TRY(_nErr, _stub_unpack((_praROutPost + 0), _ppraROutPost, ((char*)_primROut + 0), (uint32_t*)&(((uint32_t*)_rout2)[0]), (uint32_t*)&(((uint32_t*)_rout2)[1]), (uint32_t*)&(((uint32_t*)_rout2)[5]), (uint32_t*)&(((uint32_t*)_rout2)[9]), (uint32_t*)&(((uint32_t*)_rout2)[10]), (uint32_t*)&(((uint32_t*)_rout2)[26]), SLIM_IFPTR32((char**)&(((uint32_t*)_rout2)[27]), (char**)&(((uint64_t*)_rout2)[14])), SLIM_IFPTR32((uint32_t*)&(((uint32_t*)_rout2)[28]), (uint32_t*)&(((uint32_t*)_rout2)[30])))); + _QAIC_CATCH(_nErr) {} + _CATCH_FARF(_nErr) { + _QAIC_FARF(RUNTIME_ERROR, "ERROR 0x%x: handle=0x%"PRIx64", scalar=0x%x, method ID=%d: %s failed\n", _nErr , _handle, REMOTE_SCALARS_MAKEX(0, _mid, (_numIn[0] + 1), (_numROut[0] + 1), (_numInH[0] + 0), (_numROutH[0] + 0)), _mid, __func__); +} + _allocator_deinit(_al); + return _nErr; } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_add)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { - uint32_t _mid = 3; - return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); + uint32_t _mid = 3; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, 
(uintptr_t*)src1, (uintptr_t*)dst); } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_mulmat)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { - uint32_t _mid = 4; - return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); + uint32_t _mid = 4; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_softmax)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { - uint32_t _mid = 5; - return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); + uint32_t _mid = 5; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_rmsnorm)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { - uint32_t _mid = 6; - return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); + uint32_t _mid = 6; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } __QAIC_STUB_EXPORT int __QAIC_STUB(ggmlop_dsp_pool2d)(remote_handle64 _handle, const dsptensor* src0, const dsptensor* src1, dsptensor* dst) __QAIC_STUB_ATTRIBUTE { - uint32_t _mid = 7; - return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); + uint32_t _mid = 7; + return _stub_method_1(_handle, _mid, (uintptr_t*)src0, (uintptr_t*)src1, (uintptr_t*)dst); } From 6d8ad6fa295a07c8556c26f7e1a23335e061eab5 Mon Sep 17 00:00:00 2001 From: l3utterfly Date: Sat, 17 May 2025 22:01:29 +0800 Subject: [PATCH 200/200] Implement file management for libggmlop-skel.so based on DSP architecture version in ggml-hexagon backend --- ggml/src/ggml-hexagon/ggml-hexagon.cpp | 51 ++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/ggml/src/ggml-hexagon/ggml-hexagon.cpp b/ggml/src/ggml-hexagon/ggml-hexagon.cpp index c4da5d0caec4c..f8265baa16205 100644 --- a/ggml/src/ggml-hexagon/ggml-hexagon.cpp +++ b/ggml/src/ggml-hexagon/ggml-hexagon.cpp @@ -15,6 +15,7 @@ * section-6 implementation of hwaccel approach through QNN: offload ggmlop to QNN * section-7 cDSP helper function * section-8 implementation of ggml-hexagon backend according to specification in ggml backend subsystem + * section-9 implementations of various stub methods for libcdsprpc.so * * currently provide following ggml op' implementation through QNN: * - GGML_OP_ADD/GGML_OP_SUB/GGML_OP_MUL/GGML_OP_DIV/GGML_OP_LOG/GGML_OP_SQRT: @@ -5464,6 +5465,54 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { goto bail; } + { + uint32_t dsp_version = 0; + ggmlhexagon_get_hvx_arch_ver(ctx->domain_id, &dsp_version); + + if (dsp_version == 0x68 || dsp_version == 0x69 || dsp_version == 0x73 || + dsp_version == 0x75 || dsp_version == 0x79) { + + // delete the file $(g_hexagon_appcfg.runtime_libpath)/libggmlop-skel.so if it exists + std::string filepath = std::string(g_hexagon_appcfg.runtime_libpath) + "/libggmlop-skel.so"; + if (std::filesystem::exists(filepath)) { + std::filesystem::remove(filepath); + } + + // detect the htp arch number + size_t htp_arch = ggmlhexagon_htparch_hex_to_decimal(dsp_version); + + // find the file $(g_hexagon_appcfg.runtime_libpath)/libggmlop-skelV$(htp_arch).so if it exists + // copy and rename it to libggmlop-skel.so in the same folder + + // Construct file paths 
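+                // e.g. on a v73 cDSP the intent is to pick "<runtime_libpath>/libggmlop-skelV73.so"
+                // and copy it over the generic "libggmlop-skel.so" in the same folder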
+ std::string source_filename = std::string("libggmlop-skelV") + std::to_string(htp_arch) + ".so"; + std::string source_path = std::string(g_hexagon_appcfg.runtime_libpath) + "/" + source_filename; + std::string dest_path = std::string(g_hexagon_appcfg.runtime_libpath) + "/libggmlop-skel.so"; + + // Check if source file exists + if (std::filesystem::exists(source_path)) { + // Copy and rename the file + try { + std::filesystem::copy_file( + source_path, + dest_path, + std::filesystem::copy_options::overwrite_existing + ); + } catch (const std::filesystem::filesystem_error& e) { + // Handle error + GGMLHEXAGON_LOG_WARN("Error copying file: %s", e.what()); + goto bail; + } + } else { + GGMLHEXAGON_LOG_WARN("Error finding skel library: %s", source_path.c_str()); + goto bail; + } + } else { + GGMLHEXAGON_LOG_WARN("error: dsp arch version 0x%x is not supported", dsp_version); + goto bail; + } + } + ggmlop_domain_uri_len = strlen(ggmlop_URI) + MAX_DOMAIN_NAMELEN; ggmlop_domain_uri = (char *)malloc(ggmlop_domain_uri_len); snprintf(ggmlop_domain_uri, ggmlop_domain_uri_len, "%s%s", ggmlop_URI, uri); @@ -5473,7 +5522,9 @@ static int ggmlhexagon_init_dsp(ggml_backend_hexagon_context * ctx) { GGMLHEXAGON_LOG_INFO("succeed to open domain %d(%s)", domain_id, ggmlhexagon_get_dsp_name(domain_id)); //FIXME: only support offload fp32 GGML_OP_MUL_MAT to cDSP GGMLHEXAGON_LOG_INFO("only support offload fp32 GGML_OP_ADD and fp32 GGML_OP_MUL_MAT to cDSP currently"); + ggmlhexagon_probe_dspinfo(ctx); + //FIXME: re-use this function to pass thread_counts info to code on cDSP side before fully understand qidl mechanism ggmlop_dsp_setclocks(ctx->ggmlop_handle, HAP_DCVS_VCORNER_TURBO_PLUS, 40, 1, g_hexagon_appcfg.thread_counts); ggmlhexagon_set_rpc_latency(ctx->ggmlop_handle, RPC_POLL_QOS, 100);
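// A minimal sketch, assuming ggmlhexagon_htparch_hex_to_decimal() (not shown in this hunk)
// simply re-reads the two hex nibbles of the DSP arch ID as decimal digits
// (0x68 -> 68, 0x73 -> 73, 0x75 -> 75), so the result matches the "V" number used in the
// libggmlop-skelV<arch>.so filename; the real helper is defined elsewhere and may differ.
static size_t ggmlhexagon_htparch_hex_to_decimal_sketch(uint32_t dsp_arch_version) {
    size_t hi = (dsp_arch_version >> 4) & 0xF;   // high nibble, e.g. 7 for 0x73
    size_t lo = dsp_arch_version & 0xF;          // low nibble,  e.g. 3 for 0x73
    return hi * 10 + lo;                         // 0x73 -> 73, 0x79 -> 79
}
// usage example: ggmlhexagon_htparch_hex_to_decimal_sketch(0x75) yields 75,
// which would produce the source filename "libggmlop-skelV75.so" in the logic above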