diff --git a/ggml_extend.hpp b/ggml_extend.hpp
index 9f7d0b39..a125357b 100644
--- a/ggml_extend.hpp
+++ b/ggml_extend.hpp
@@ -483,12 +483,15 @@ __STATIC_INLINE__ void ggml_split_tensor_2d(struct ggml_tensor* input,
     int64_t width    = output->ne[0];
     int64_t height   = output->ne[1];
     int64_t channels = output->ne[2];
+    int64_t ne3      = output->ne[3];
     GGML_ASSERT(input->type == GGML_TYPE_F32 && output->type == GGML_TYPE_F32);
     for (int iy = 0; iy < height; iy++) {
         for (int ix = 0; ix < width; ix++) {
             for (int k = 0; k < channels; k++) {
-                float value = ggml_tensor_get_f32(input, ix + x, iy + y, k);
-                ggml_tensor_set_f32(output, value, ix, iy, k);
+                for (int l = 0; l < ne3; l++) {
+                    float value = ggml_tensor_get_f32(input, ix + x, iy + y, k, l);
+                    ggml_tensor_set_f32(output, value, ix, iy, k, l);
+                }
             }
         }
     }
@@ -511,6 +514,7 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
     int64_t width    = input->ne[0];
     int64_t height   = input->ne[1];
     int64_t channels = input->ne[2];
+    int64_t ne3      = input->ne[3];
 
     int64_t img_width  = output->ne[0];
     int64_t img_height = output->ne[1];
@@ -519,24 +523,26 @@ __STATIC_INLINE__ void ggml_merge_tensor_2d(struct ggml_tensor* input,
     for (int iy = y_skip; iy < height; iy++) {
         for (int ix = x_skip; ix < width; ix++) {
             for (int k = 0; k < channels; k++) {
-                float new_value = ggml_tensor_get_f32(input, ix, iy, k);
-                if (overlap_x > 0 || overlap_y > 0) {  // blend colors in overlapped area
-                    float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k);
-
-                    const float x_f_0 = (overlap_x > 0 && x > 0) ? (ix - x_skip) / float(overlap_x) : 1;
-                    const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float(overlap_x) : 1;
-                    const float y_f_0 = (overlap_y > 0 && y > 0) ? (iy - y_skip) / float(overlap_y) : 1;
-                    const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float(overlap_y) : 1;
-
-                    const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
-                    const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);
-
-                    ggml_tensor_set_f32(
-                        output,
-                        old_value + new_value * ggml_smootherstep_f32(y_f) * ggml_smootherstep_f32(x_f),
-                        x + ix, y + iy, k);
-                } else {
-                    ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k);
+                for (int l = 0; l < ne3; l++) {
+                    float new_value = ggml_tensor_get_f32(input, ix, iy, k, l);
+                    if (overlap_x > 0 || overlap_y > 0) {  // blend colors in overlapped area
+                        float old_value = ggml_tensor_get_f32(output, x + ix, y + iy, k, l);
+
+                        const float x_f_0 = (overlap_x > 0 && x > 0) ? (ix - x_skip) / float(overlap_x) : 1;
+                        const float x_f_1 = (overlap_x > 0 && x < (img_width - width)) ? (width - ix) / float(overlap_x) : 1;
+                        const float y_f_0 = (overlap_y > 0 && y > 0) ? (iy - y_skip) / float(overlap_y) : 1;
+                        const float y_f_1 = (overlap_y > 0 && y < (img_height - height)) ? (height - iy) / float(overlap_y) : 1;
+
+                        const float x_f = std::min(std::min(x_f_0, x_f_1), 1.f);
+                        const float y_f = std::min(std::min(y_f_0, y_f_1), 1.f);
+
+                        ggml_tensor_set_f32(
+                            output,
+                            old_value + new_value * ggml_smootherstep_f32(y_f) * ggml_smootherstep_f32(x_f),
+                            x + ix, y + iy, k, l);
+                    } else {
+                        ggml_tensor_set_f32(output, new_value, x + ix, y + iy, k, l);
+                    }
                 }
             }
         }
@@ -852,8 +858,8 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
     }
 
     struct ggml_init_params params = {};
-    params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * sizeof(float);                     // input chunk
-    params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * sizeof(float);                  // output chunk
+    params.mem_size += input_tile_size_x * input_tile_size_y * input->ne[2] * input->ne[3] * sizeof(float);      // input chunk
+    params.mem_size += output_tile_size_x * output_tile_size_y * output->ne[2] * output->ne[3] * sizeof(float);  // output chunk
     params.mem_size += 3 * ggml_tensor_overhead();
     params.mem_buffer = NULL;
     params.no_alloc = false;
@@ -868,8 +874,8 @@ __STATIC_INLINE__ void sd_tiling_non_square(ggml_tensor* input,
     }
 
     // tiling
-    ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], 1);
-    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], 1);
+    ggml_tensor* input_tile  = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, input_tile_size_x, input_tile_size_y, input->ne[2], input->ne[3]);
+    ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, output_tile_size_x, output_tile_size_y, output->ne[2], output->ne[3]);
     int num_tiles = num_tiles_x * num_tiles_y;
     LOG_INFO("processing %i tiles", num_tiles);
     pretty_progress(0, num_tiles, 0.0f);
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 3ade99e9..51f8cbe0 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -1440,10 +1440,19 @@ class StableDiffusionGGML {
         if (vae_tiling_params.enabled && !encode_video) {
             // TODO wan2.2 vae support?
             int C = sd_version_is_dit(version) ? 16 : 4;
-            if (!use_tiny_autoencoder) {
-                C *= 2;
+            int ne2;
+            int ne3;
+            if (sd_version_is_qwen_image(version)) {
+                ne2 = 1;
+                ne3 = C * x->ne[3];
+            } else {
+                if (!use_tiny_autoencoder) {
+                    C *= 2;
+                }
+                ne2 = C;
+                ne3 = x->ne[3];
             }
-            result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, C, x->ne[3]);
+            result = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, W, H, ne2, ne3);
         }
 
         if (sd_version_is_qwen_image(version)) {
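
Reviewer note (not part of the patch): the split/merge loops only need to pass the new `l` index through because `ggml_tensor_get_f32` / `ggml_tensor_set_f32` already accept a fourth coordinate. A minimal standalone sketch of the strided access such helpers perform, assuming ggml's usual `nb[]` byte strides; `Tensor4D` and `get_f32` are hypothetical stand-ins, not the real ggml API:

    #include <cstddef>
    #include <cstring>

    struct Tensor4D {      // hypothetical stand-in for the ggml_tensor fields used here
        void* data;
        size_t nb[4];      // byte stride per dimension; nb[0] == sizeof(float) for contiguous f32
    };

    // Load one float at coordinates (i0, i1, i2, i3) = (x, y, channel, ne[3] index).
    static float get_f32(const Tensor4D* t, int i0, int i1, int i2, int i3) {
        const char* p = (const char*)t->data +
                        i3 * t->nb[3] + i2 * t->nb[2] + i1 * t->nb[1] + i0 * t->nb[0];
        float v;
        std::memcpy(&v, p, sizeof(v));  // memcpy load avoids strict-aliasing issues
        return v;
    }

Because the fourth index previously defaulted to 0, the old loops silently dropped every slice with ne[3] > 1; the added `l` loop is what makes the tiling correct for those tensors.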
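Reviewer note: the blend in ggml_merge_tensor_2d weights each tile's contribution with smootherstep ramps across the overlap, so adjacent tiles sum back to full intensity. A self-contained check of that property, assuming ggml_smootherstep_f32 is the standard quintic 6t^5 - 15t^4 + 10t^3 (local copy below is an assumption, not the patched code):

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    // Assumed to match ggml_smootherstep_f32: the standard quintic smootherstep.
    static float smootherstep(float t) {
        return t * t * t * (t * (6.0f * t - 15.0f) + 10.0f);
    }

    int main() {
        const int overlap = 8;  // hypothetical overlap width in pixels
        for (int i = 0; i <= overlap; i++) {
            float t     = i / (float)overlap;
            float w_in  = smootherstep(t);         // incoming tile ramps up
            float w_out = smootherstep(1.0f - t);  // previous tile ramped down symmetrically
            // s(t) + s(1 - t) == 1, so the overlapped region keeps full intensity.
            assert(std::fabs(w_in + w_out - 1.0f) < 1e-5f);
            printf("t=%.3f in=%.4f out=%.4f\n", t, w_in, w_out);
        }
        return 0;
    }

The x and y ramps are computed independently and multiplied, so corner regions where four tiles overlap blend the same way in both axes.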
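Reviewer note: the stable-diffusion.cpp hunk appears to be the tiled VAE encode path (the C *= 2 presumably accounts for the full VAE emitting concatenated mean/logvar moments, which TAESD does not), and the qwen-image branch folds the latent channels into ne[3] instead of ne[2]. Restated as a hypothetical helper for clarity; all names below are illustrative:

    #include <cstdint>

    struct ResultShape {
        int64_t ne2;  // channel dimension of the result tensor
        int64_t ne3;  // fourth dimension (batch / packed channels)
    };

    // Mirrors the patched branch; is_qwen_image, use_tiny_autoencoder, C and
    // x_ne3 correspond to sd_version_is_qwen_image(version),
    // use_tiny_autoencoder, C, and x->ne[3] in the diff.
    static ResultShape tiled_result_shape(bool is_qwen_image, bool use_tiny_autoencoder,
                                          int C, int64_t x_ne3) {
        if (is_qwen_image) {
            return {1, C * x_ne3};  // channels packed into ne[3], ne[2] collapses to 1
        }
        if (!use_tiny_autoencoder) {
            C *= 2;  // assumed: full VAE outputs 2*C channels (mean + logvar)
        }
        return {C, x_ne3};
    }

Either way the total element count is W * H * ne2 * ne3, which is why the ne[3]-aware tile allocation and mem_size fixes in ggml_extend.hpp are prerequisites for this branch.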