From 97a0d52dab07cbaf5ada2e4381d8e7fa9e8152f8 Mon Sep 17 00:00:00 2001
From: leejet
Date: Fri, 7 Nov 2025 22:55:48 +0800
Subject: [PATCH] fix: compatibility for models with modified tensor shapes

---
 common.hpp      | 16 ++++++++++++++++
 ggml_extend.hpp |  2 +-
 vae.hpp         | 19 +++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/common.hpp b/common.hpp
index 59540752c..c68ddafe5 100644
--- a/common.hpp
+++ b/common.hpp
@@ -410,6 +410,22 @@ class SpatialTransformer : public GGMLBlock {
     int64_t context_dim = 768;  // hidden_size, 1024 for VERSION_SD2
     bool use_linear     = false;
 
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter != tensor_storage_map.end()) {
+            int64_t inner_dim = n_head * d_head;
+            if (iter->second.n_dims == 4 && use_linear) {
+                use_linear         = false;
+                blocks["proj_in"]  = std::make_shared<Conv2d>(in_channels, inner_dim, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(inner_dim, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {
+                use_linear         = true;
+                blocks["proj_in"]  = std::make_shared<Linear>(in_channels, inner_dim);
+                blocks["proj_out"] = std::make_shared<Linear>(inner_dim, in_channels);
+            }
+        }
+    }
+
 public:
     SpatialTransformer(int64_t in_channels,
                        int64_t n_head,
diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index d11e07a15..ac6a2ccc2 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -1926,8 +1926,8 @@ class GGMLBlock {
         if (prefix.size() > 0) {
             prefix = prefix + ".";
         }
-        init_blocks(ctx, tensor_storage_map, prefix);
         init_params(ctx, tensor_storage_map, prefix);
+        init_blocks(ctx, tensor_storage_map, prefix);
     }
 
     size_t get_params_num() {
diff --git a/vae.hpp b/vae.hpp
index ddf970c9a..9fc8fb75a 100644
--- a/vae.hpp
+++ b/vae.hpp
@@ -66,6 +66,25 @@ class AttnBlock : public UnaryBlock {
    int64_t in_channels;
    bool use_linear;
 
+    void init_params(struct ggml_context* ctx, const String2TensorStorage& tensor_storage_map = {}, const std::string prefix = "") {
+        auto iter = tensor_storage_map.find(prefix + "proj_out.weight");
+        if (iter != tensor_storage_map.end()) {
+            if (iter->second.n_dims == 4 && use_linear) {
+                use_linear         = false;
+                blocks["q"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["k"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["v"]        = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+                blocks["proj_out"] = std::make_shared<Conv2d>(in_channels, in_channels, std::pair{1, 1});
+            } else if (iter->second.n_dims == 2 && !use_linear) {
+                use_linear         = true;
+                blocks["q"]        = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["k"]        = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["v"]        = std::make_shared<Linear>(in_channels, in_channels);
+                blocks["proj_out"] = std::make_shared<Linear>(in_channels, in_channels);
+            }
+        }
+    }
+
 public:
     AttnBlock(int64_t in_channels, bool use_linear)
         : in_channels(in_channels), use_linear(use_linear) {
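
For reference, below is a minimal self-contained sketch of the mechanism this patch relies on. It is an illustration, not the real API: Block, Attn, and TensorStorage are hypothetical stand-ins for GGMLBlock, AttnBlock, and the tensor-storage map. It shows why the ordering change in ggml_extend.hpp matters: because init_params now runs before the recursive block initialization, a shape-aware override can replace a Linear placeholder with a Conv2d (a 4-D proj_out.weight indicates a 1x1-convolution checkpoint, a 2-D weight a linear one) before the recursion descends into the replaced child.

    // Minimal sketch with hypothetical, simplified names (not the real
    // GGMLBlock API): init_params() runs before the recursive walk, so a
    // shape-aware override can swap block types first.
    #include <cstdio>
    #include <map>
    #include <memory>
    #include <string>

    struct TensorStorage { int n_dims = 0; };
    using String2TensorStorage = std::map<std::string, TensorStorage>;

    struct Block {
        std::map<std::string, std::shared_ptr<Block>> blocks;
        virtual ~Block() = default;
        // Hook for shape-dependent fixups; the default does nothing.
        virtual void init_params(const String2TensorStorage&, const std::string&) {}
        void init(const String2TensorStorage& m, std::string prefix) {
            if (!prefix.empty()) prefix += ".";
            init_params(m, prefix);             // may replace entries in `blocks` first...
            for (auto& [name, child] : blocks)
                child->init(m, prefix + name);  // ...then the recursion sees the replacements
        }
    };

    struct Linear : Block {};
    struct Conv2d : Block {};

    // Mirrors the AttnBlock fixup: a 4-D proj_out.weight means the checkpoint
    // stores 1x1 convolutions, so the Linear placeholder is replaced.
    struct Attn : Block {
        bool use_linear = true;
        Attn() { blocks["proj_out"] = std::make_shared<Linear>(); }
        void init_params(const String2TensorStorage& m, const std::string& prefix) override {
            auto iter = m.find(prefix + "proj_out.weight");
            if (iter != m.end() && iter->second.n_dims == 4 && use_linear) {
                use_linear         = false;
                blocks["proj_out"] = std::make_shared<Conv2d>();
            }
        }
    };

    int main() {
        String2TensorStorage m{{"attn.proj_out.weight", {4}}};  // 4-D => conv weights
        Attn attn;
        attn.init(m, "attn");
        std::printf("use_linear = %d\n", (int)attn.use_linear);  // prints 0: swapped to Conv2d
        return 0;
    }

Doing the swap once at load time, before any parameters or sub-blocks are created, avoids branching on tensor shape in every forward pass and keeps the rest of initialization unaware of which checkpoint layout was loaded.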