diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index a125357b4..e01d41bfe 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -372,12 +372,14 @@ __STATIC_INLINE__ float sigmoid(float x) {
 
 // SPECIAL OPERATIONS WITH TENSORS
 
-__STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input) {
+__STATIC_INLINE__ uint8_t* sd_tensor_to_image(struct ggml_tensor* input, uint8_t* image_data = nullptr) {
     int64_t width    = input->ne[0];
     int64_t height   = input->ne[1];
     int64_t channels = input->ne[2];
     GGML_ASSERT(channels == 3 && input->type == GGML_TYPE_F32);
-    uint8_t* image_data = (uint8_t*)malloc(width * height * channels);
+    if (image_data == nullptr) {
+        image_data = (uint8_t*)malloc(width * height * channels);
+    }
     for (int iy = 0; iy < height; iy++) {
         for (int ix = 0; ix < width; ix++) {
             for (int k = 0; k < channels; k++) {
diff --git a/preprocessing.hpp b/preprocessing.hpp
index 9cace2f44..552aa6424 100644
--- a/preprocessing.hpp
+++ b/preprocessing.hpp
@@ -6,7 +6,7 @@
 
 void convolve(struct ggml_tensor* input, struct ggml_tensor* output, struct ggml_tensor* kernel, int padding) {
     struct ggml_init_params params;
-    params.mem_size                 = 20 * 1024 * 1024;  // 10
+    params.mem_size                 = 80 * input->ne[0] * input->ne[1];  // 20M for 512x512
     params.mem_buffer               = NULL;
     params.no_alloc                 = false;
     struct ggml_context* ctx0       = ggml_init(params);
@@ -164,7 +164,7 @@ void threshold_hystersis(struct ggml_tensor* img, float high_threshold, float lo
 
 bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold, float weak, float strong, bool inverse) {
     struct ggml_init_params params;
-    params.mem_size               = static_cast<size_t>(10 * 1024 * 1024);  // 10MB
+    params.mem_size               = static_cast<size_t>(40 * img.width * img.height);  // 10MB for 512x512
     params.mem_buffer             = NULL;
     params.no_alloc               = false;
     struct ggml_context* work_ctx = ggml_init(params);
@@ -218,9 +218,7 @@ bool preprocess_canny(sd_image_t img, float high_threshold, float low_threshold,
             ggml_tensor_set_f32(image, gray, ix, iy, 2);
         }
     }
-    uint8_t* output = sd_tensor_to_image(image);
-    free(img.data);
-    img.data = output;
+    sd_tensor_to_image(image, img.data);
     ggml_free(work_ctx);
     return true;
 }
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 62b40c6d0..68bfe9ac9 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1434,7 +1434,7 @@ class StableDiffusionGGML {
             int ne3;
             if (sd_version_is_qwen_image(version)) {
                 ne2 = 1;
-                ne3 = C*x->ne[3];
+                ne3 = C * x->ne[3];
             } else {
                 if (!use_tiny_autoencoder) {
                     C *= 2;