diff --git a/piet-gpu-derive/src/glsl.rs b/piet-gpu-derive/src/glsl.rs index b55dda49..24096378 100644 --- a/piet-gpu-derive/src/glsl.rs +++ b/piet-gpu-derive/src/glsl.rs @@ -31,17 +31,22 @@ pub fn gen_glsl(module: &LayoutModule) -> String { for name in &module.def_names { let def = module.defs.get(name).unwrap(); + let mem = &"memory".to_owned(); + let mut buf_name = &module.name; + if !module.name.eq(&"state") && !module.name.eq(&"scene") { + buf_name = mem; + } match def { (_size, LayoutTypeDef::Struct(fields)) => { - gen_struct_read(&mut r, &module.name, &name, fields); + gen_struct_read(&mut r, buf_name, &name, fields); if module.gpu_write { - gen_struct_write(&mut r, &module.name, &name, fields); + gen_struct_write(&mut r, buf_name, &name, fields); } } (_size, LayoutTypeDef::Enum(en)) => { - gen_enum_read(&mut r, &module.name, &name, en); + gen_enum_read(&mut r, buf_name, &name, en); if module.gpu_write { - gen_enum_write(&mut r, &module.name, &name, en); + gen_enum_write(&mut r, buf_name, &name, en); } } } diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h index 1e1ebe6a..8a757efa 100644 --- a/piet-gpu/shader/annotated.h +++ b/piet-gpu/shader/annotated.h @@ -64,11 +64,11 @@ AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) { AnnoFill AnnoFill_read(AnnoFillRef ref) { uint ix = ref.offset >> 2; - uint raw0 = annotated[ix + 0]; - uint raw1 = annotated[ix + 1]; - uint raw2 = annotated[ix + 2]; - uint raw3 = annotated[ix + 3]; - uint raw4 = annotated[ix + 4]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; + uint raw4 = memory[ix + 4]; AnnoFill s; s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); s.rgba_color = raw4; @@ -77,21 +77,21 @@ AnnoFill AnnoFill_read(AnnoFillRef ref) { void AnnoFill_write(AnnoFillRef ref, AnnoFill s) { uint ix = ref.offset >> 2; - annotated[ix + 0] = floatBitsToUint(s.bbox.x); - annotated[ix + 1] = floatBitsToUint(s.bbox.y); - annotated[ix + 2] = floatBitsToUint(s.bbox.z); - annotated[ix + 3] = floatBitsToUint(s.bbox.w); - annotated[ix + 4] = s.rgba_color; + memory[ix + 0] = floatBitsToUint(s.bbox.x); + memory[ix + 1] = floatBitsToUint(s.bbox.y); + memory[ix + 2] = floatBitsToUint(s.bbox.z); + memory[ix + 3] = floatBitsToUint(s.bbox.w); + memory[ix + 4] = s.rgba_color; } AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) { uint ix = ref.offset >> 2; - uint raw0 = annotated[ix + 0]; - uint raw1 = annotated[ix + 1]; - uint raw2 = annotated[ix + 2]; - uint raw3 = annotated[ix + 3]; - uint raw4 = annotated[ix + 4]; - uint raw5 = annotated[ix + 5]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; + uint raw4 = memory[ix + 4]; + uint raw5 = memory[ix + 5]; AnnoStroke s; s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); s.rgba_color = raw4; @@ -101,20 +101,20 @@ AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) { void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) { uint ix = ref.offset >> 2; - annotated[ix + 0] = floatBitsToUint(s.bbox.x); - annotated[ix + 1] = floatBitsToUint(s.bbox.y); - annotated[ix + 2] = floatBitsToUint(s.bbox.z); - annotated[ix + 3] = floatBitsToUint(s.bbox.w); - annotated[ix + 4] = s.rgba_color; - annotated[ix + 5] = floatBitsToUint(s.linewidth); + memory[ix + 0] = floatBitsToUint(s.bbox.x); + memory[ix + 1] = floatBitsToUint(s.bbox.y); + memory[ix + 2] = floatBitsToUint(s.bbox.z); + memory[ix + 3] = floatBitsToUint(s.bbox.w); + memory[ix + 4] = s.rgba_color; + memory[ix + 5] = floatBitsToUint(s.linewidth); } AnnoClip AnnoClip_read(AnnoClipRef ref) { uint ix = ref.offset >> 2; - uint raw0 = annotated[ix + 0]; - uint raw1 = annotated[ix + 1]; - uint raw2 = annotated[ix + 2]; - uint raw3 = annotated[ix + 3]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; AnnoClip s; s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3)); return s; @@ -122,14 +122,14 @@ AnnoClip AnnoClip_read(AnnoClipRef ref) { void AnnoClip_write(AnnoClipRef ref, AnnoClip s) { uint ix = ref.offset >> 2; - annotated[ix + 0] = floatBitsToUint(s.bbox.x); - annotated[ix + 1] = floatBitsToUint(s.bbox.y); - annotated[ix + 2] = floatBitsToUint(s.bbox.z); - annotated[ix + 3] = floatBitsToUint(s.bbox.w); + memory[ix + 0] = floatBitsToUint(s.bbox.x); + memory[ix + 1] = floatBitsToUint(s.bbox.y); + memory[ix + 2] = floatBitsToUint(s.bbox.z); + memory[ix + 3] = floatBitsToUint(s.bbox.w); } uint Annotated_tag(AnnotatedRef ref) { - return annotated[ref.offset >> 2]; + return memory[ref.offset >> 2]; } AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) { @@ -149,26 +149,26 @@ AnnoClip Annotated_EndClip_read(AnnotatedRef ref) { } void Annotated_Nop_write(AnnotatedRef ref) { - annotated[ref.offset >> 2] = Annotated_Nop; + memory[ref.offset >> 2] = Annotated_Nop; } void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) { - annotated[ref.offset >> 2] = Annotated_Stroke; + memory[ref.offset >> 2] = Annotated_Stroke; AnnoStroke_write(AnnoStrokeRef(ref.offset + 4), s); } void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) { - annotated[ref.offset >> 2] = Annotated_Fill; + memory[ref.offset >> 2] = Annotated_Fill; AnnoFill_write(AnnoFillRef(ref.offset + 4), s); } void Annotated_BeginClip_write(AnnotatedRef ref, AnnoClip s) { - annotated[ref.offset >> 2] = Annotated_BeginClip; + memory[ref.offset >> 2] = Annotated_BeginClip; AnnoClip_write(AnnoClipRef(ref.offset + 4), s); } void Annotated_EndClip_write(AnnotatedRef ref, AnnoClip s) { - annotated[ref.offset >> 2] = Annotated_EndClip; + memory[ref.offset >> 2] = Annotated_EndClip; AnnoClip_write(AnnoClipRef(ref.offset + 4), s); } diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp index 42eec9c9..f57d6e08 100644 --- a/piet-gpu/shader/backdrop.comp +++ b/piet-gpu/shader/backdrop.comp @@ -16,27 +16,15 @@ #extension GL_GOOGLE_include_directive : enable #include "setup.h" +#include "mem.h" #define LG_BACKDROP_WG (7 + LG_WG_FACTOR) #define BACKDROP_WG (1 << LG_BACKDROP_WG) layout(local_size_x = BACKDROP_WG, local_size_y = 1) in; -layout(set = 0, binding = 0) buffer AnnotatedBuf { - uint[] annotated; -}; - -// This is really only used for n_elements; maybe we can handle that -// a different way, but it's convenient to have the same signature as -// tile allocation. -layout(set = 0, binding = 1) readonly buffer AllocBuf { - uint n_elements; // paths - uint n_pathseg; - uint alloc; -}; - -layout(set = 0, binding = 2) buffer TileBuf { - uint[] tile; +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; }; #include "annotated.h" @@ -47,18 +35,22 @@ shared uint sh_row_base[BACKDROP_WG]; shared uint sh_row_width[BACKDROP_WG]; void main() { + if (mem_overflow) { + return; + } + uint th_ix = gl_LocalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x; - AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); + AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); // Work assignment: 1 thread : 1 path element uint row_count = 0; - if (element_ix < n_elements) { + if (element_ix < conf.n_elements) { uint tag = Annotated_tag(ref); switch (tag) { case Annotated_Fill: case Annotated_BeginClip: - PathRef path_ref = PathRef(element_ix * Path_size); + PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size); Path path = Path_read(path_ref); sh_row_width[th_ix] = path.bbox.z - path.bbox.x; row_count = path.bbox.w - path.bbox.y; @@ -98,11 +90,11 @@ void main() { // Process one row sequentially // Read backdrop value per tile and prefix sum it uint tile_el_ix = sh_row_base[el_ix] + seq_ix * 2 * width; - uint sum = tile[tile_el_ix]; + uint sum = memory[tile_el_ix]; for (uint x = 1; x < width; x++) { tile_el_ix += 2; - sum += tile[tile_el_ix]; - tile[tile_el_ix] = sum; + sum += memory[tile_el_ix]; + memory[tile_el_ix] = sum; } } } diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv index 54bf7368..defe30e7 100644 Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/backdrop.spv differ diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp index fc1d3557..17acc767 100644 --- a/piet-gpu/shader/binning.comp +++ b/piet-gpu/shader/binning.comp @@ -10,20 +10,12 @@ #extension GL_GOOGLE_include_directive : enable #include "setup.h" +#include "mem.h" layout(local_size_x = N_TILE, local_size_y = 1) in; -layout(set = 0, binding = 0) buffer AnnotatedBuf { - uint[] annotated; -}; - -layout(set = 0, binding = 1) buffer AllocBuf { - uint n_elements; // paths - uint alloc; -}; - -layout(set = 0, binding = 2) buffer BinsBuf { - uint[] bins; +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; }; #include "annotated.h" @@ -41,19 +33,27 @@ layout(set = 0, binding = 2) buffer BinsBuf { shared uint bitmaps[N_SLICE][N_TILE]; shared uint count[N_SLICE][N_TILE]; shared uint sh_chunk_start[N_TILE]; +shared bool sh_alloc_failed; void main() { - uint my_n_elements = n_elements; + if (mem_overflow) { + return; + } + + uint my_n_elements = conf.n_elements; uint my_partition = gl_WorkGroupID.x; for (uint i = 0; i < N_SLICE; i++) { bitmaps[i][gl_LocalInvocationID.x] = 0; } + if (gl_LocalInvocationID.x == 0) { + sh_alloc_failed = false; + } barrier(); // Read inputs and determine coverage of bins uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x; - AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); + AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); uint tag = Annotated_Nop; if (element_ix < my_n_elements) { tag = Annotated_tag(ref); @@ -103,19 +103,26 @@ void main() { count[i][gl_LocalInvocationID.x] = element_count; } // element_count is number of elements covering bin for this invocation. - uint chunk_start = 0; + Alloc chunk_alloc = Alloc(0, false); if (element_count != 0) { // TODO: aggregate atomic adds (subgroup is probably fastest) - chunk_start = atomicAdd(alloc, element_count * BinInstance_size); - sh_chunk_start[gl_LocalInvocationID.x] = chunk_start; + chunk_alloc = malloc(element_count * BinInstance_size); + sh_chunk_start[gl_LocalInvocationID.x] = chunk_alloc.offset; + if (chunk_alloc.failed) { + sh_alloc_failed = true; + } } // Note: it might be more efficient for reading to do this in the // other order (each bin is a contiguous sequence of partitions) - uint out_ix = (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; - bins[out_ix] = element_count; - bins[out_ix + 1] = chunk_start; + uint out_ix = (conf.bin_base >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2; + memory[out_ix] = element_count; + memory[out_ix + 1] = chunk_alloc.offset; barrier(); + if (sh_alloc_failed) { + return; + } + // Use similar strategy as Laine & Karras paper; loop over bbox of bins // touched by this element x = x0; diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv index abe17d49..da2df762 100644 Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h index bc32dda0..43642785 100644 --- a/piet-gpu/shader/bins.h +++ b/piet-gpu/shader/bins.h @@ -18,7 +18,7 @@ BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) { BinInstance BinInstance_read(BinInstanceRef ref) { uint ix = ref.offset >> 2; - uint raw0 = bins[ix + 0]; + uint raw0 = memory[ix + 0]; BinInstance s; s.element_ix = raw0; return s; @@ -26,6 +26,6 @@ BinInstance BinInstance_read(BinInstanceRef ref) { void BinInstance_write(BinInstanceRef ref, BinInstance s) { uint ix = ref.offset >> 2; - bins[ix + 0] = s.element_ix; + memory[ix + 0] = s.element_ix; } diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp index a173608b..a70318ad 100644 --- a/piet-gpu/shader/coarse.comp +++ b/piet-gpu/shader/coarse.comp @@ -14,28 +14,12 @@ #extension GL_GOOGLE_include_directive : enable #include "setup.h" +#include "mem.h" layout(local_size_x = N_TILE, local_size_y = 1) in; -layout(set = 0, binding = 0) buffer AnnotatedBuf { - uint[] annotated; -}; - -layout(set = 0, binding = 1) buffer BinsBuf { - uint[] bins; -}; - -layout(set = 0, binding = 2) buffer TileBuf { - uint[] tile; -}; - -layout(set = 0, binding = 3) buffer AllocBuf { - uint n_elements; - uint alloc; -}; - -layout(set = 0, binding = 4) buffer PtclBuf { - uint[] ptcl; +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; }; #include "annotated.h" @@ -65,22 +49,31 @@ shared uint sh_tile_base[N_TILE]; shared uint sh_tile_stride[N_TILE]; // Perhaps cmd_limit should be a global? This is a style question. -void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) { - if (cmd_ref.offset > cmd_limit) { - uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC); - CmdJump jump = CmdJump(new_cmd); - Cmd_Jump_write(cmd_ref, jump); - cmd_ref = CmdRef(new_cmd); - cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size; +bool alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) { + if (cmd_ref.offset < cmd_limit) { + return true; } + Alloc new_cmd = malloc(PTCL_INITIAL_ALLOC); + if (new_cmd.failed) { + return false; + } + CmdJump jump = CmdJump(new_cmd.offset); + Cmd_Jump_write(cmd_ref, jump); + cmd_ref = CmdRef(new_cmd.offset); + cmd_limit = new_cmd.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; + return true; } void main() { + if (mem_overflow) { + return; + } + // Could use either linear or 2d layouts for both dispatch and // invocations within the workgroup. We'll use variables to abstract. uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x; uint partition_ix = 0; - uint n_partitions = (n_elements + N_TILE - 1) / N_TILE; + uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE; uint th_ix = gl_LocalInvocationID.x; // Coordinates of top left of bin, in tiles. @@ -91,7 +84,7 @@ void main() { uint tile_x = gl_LocalInvocationID.x % N_TILE_X; uint tile_y = gl_LocalInvocationID.x / N_TILE_X; uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x; - CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC); + CmdRef cmd_ref = CmdRef(conf.ptcl_base + this_tile_ix * PTCL_INITIAL_ALLOC); uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size; // The nesting depth of the clip stack uint clip_depth = 0; @@ -123,9 +116,9 @@ void main() { part_start_ix = ready_ix; uint count = 0; if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) { - uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; - count = bins[in_ix]; - sh_part_elements[th_ix] = bins[in_ix + 1]; + uint in_ix = (conf.bin_base >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2; + count = memory[in_ix]; + sh_part_elements[th_ix] = memory[in_ix + 1]; } // prefix sum of counts for (uint i = 0; i < LG_N_PART_READ; i++) { @@ -175,7 +168,7 @@ void main() { AnnotatedRef ref; if (th_ix + rd_ix < wr_ix) { element_ix = sh_elements[th_ix]; - ref = AnnotatedRef(element_ix * Annotated_size); + ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); tag = Annotated_tag(ref); } @@ -189,7 +182,7 @@ void main() { // We have one "path" for each element, even if the element isn't // actually a path (currently EndClip, but images etc in the future). uint path_ix = element_ix; - Path path = Path_read(PathRef(path_ix * Path_size)); + Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size)); uint stride = path.bbox.z - path.bbox.x; sh_tile_stride[th_ix] = stride; int dx = int(path.bbox.x) - int(bin_tile_x); @@ -232,7 +225,7 @@ void main() { el_ix = probe; } } - AnnotatedRef ref = AnnotatedRef(sh_elements[el_ix] * Annotated_size); + AnnotatedRef ref = AnnotatedRef(conf.anno_base + sh_elements[el_ix] * Annotated_size); uint tag = Annotated_tag(ref); uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0); uint width = sh_tile_width[el_ix]; @@ -281,7 +274,7 @@ void main() { // At this point, we read the element again from global memory. // If that turns out to be expensive, maybe we can pack it into // shared memory (or perhaps just the tag). - ref = AnnotatedRef(element_ix * Annotated_size); + ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); tag = Annotated_tag(ref); if (clip_zero_depth == 0) { @@ -290,7 +283,9 @@ void main() { Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix] + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size)); AnnoFill fill = Annotated_Fill_read(ref); - alloc_cmd(cmd_ref, cmd_limit); + if (!alloc_cmd(cmd_ref, cmd_limit)) { + break; + } if (tile.tile.offset != 0) { CmdFill cmd_fill; cmd_fill.tile_ref = tile.tile.offset; @@ -310,7 +305,9 @@ void main() { } else if (tile.tile.offset == 0 && clip_depth < 32) { clip_one_mask |= (1 << clip_depth); } else { - alloc_cmd(cmd_ref, cmd_limit); + if (!alloc_cmd(cmd_ref, cmd_limit)) { + break; + } if (tile.tile.offset != 0) { CmdBeginClip cmd_begin_clip; cmd_begin_clip.tile_ref = tile.tile.offset; @@ -331,7 +328,9 @@ void main() { case Annotated_EndClip: clip_depth--; if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) { - alloc_cmd(cmd_ref, cmd_limit); + if (!alloc_cmd(cmd_ref, cmd_limit)) { + break; + } Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0)); cmd_ref.offset += Cmd_size; } @@ -344,7 +343,9 @@ void main() { cmd_stroke.tile_ref = tile.tile.offset; cmd_stroke.half_width = 0.5 * stroke.linewidth; cmd_stroke.rgba_color = stroke.rgba_color; - alloc_cmd(cmd_ref, cmd_limit); + if (!alloc_cmd(cmd_ref, cmd_limit)) { + break; + } Cmd_Stroke_write(cmd_ref, cmd_stroke); cmd_ref.offset += Cmd_size; break; diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv index 215a97ab..260db696 100644 Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp index 5e8957f1..a0e50112 100644 --- a/piet-gpu/shader/elements.comp +++ b/piet-gpu/shader/elements.comp @@ -9,6 +9,9 @@ #version 450 #extension GL_GOOGLE_include_directive : enable +#include "setup.h" +#include "mem.h" + #define N_ROWS 4 #define WG_SIZE 32 #define LG_WG_SIZE 5 @@ -16,28 +19,22 @@ layout(local_size_x = WG_SIZE, local_size_y = 1) in; -layout(set = 0, binding = 0) readonly buffer SceneBuf { +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; +}; + +layout(set = 0, binding = 2) readonly buffer SceneBuf { uint[] scene; }; // It would be better to use the Vulkan memory model than // "volatile" but shooting for compatibility here rather // than doing things right. -layout(set = 0, binding = 1) volatile buffer StateBuf { +layout(set = 0, binding = 3) volatile buffer StateBuf { uint part_counter; uint[] state; }; -// The annotated results are stored here. -layout(set = 0, binding = 2) buffer AnnotatedBuf { - uint[] annotated; -}; - -// Path segments are stored here. -layout(set = 0, binding = 3) buffer PathSegBuf { - uint[] pathseg; -}; - #include "scene.h" #include "state.h" #include "annotated.h" @@ -175,6 +172,10 @@ shared uint sh_part_ix; shared State sh_prefix; void main() { + if (mem_overflow) { + return; + } + State th_state[N_ROWS]; // Determine partition to process by atomic counter (described in Section // 4.4 of prefix sum paper). @@ -341,9 +342,9 @@ void main() { } // We do encoding a bit by hand to minimize divergence. Another approach // would be to have a fill/stroke bool. - PathSegRef path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size); + PathSegRef path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size); uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic; - pathseg[path_out_ref.offset >> 2] = out_tag; + memory[path_out_ref.offset >> 2] = out_tag; PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); break; case Element_FillQuad: @@ -365,9 +366,9 @@ void main() { } // We do encoding a bit by hand to minimize divergence. Another approach // would be to have a fill/stroke bool. - path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size); + path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size); out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic; - pathseg[path_out_ref.offset >> 2] = out_tag; + memory[path_out_ref.offset >> 2] = out_tag; PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); break; case Element_FillCubic: @@ -386,9 +387,9 @@ void main() { } // We do encoding a bit by hand to minimize divergence. Another approach // would be to have a fill/stroke bool. - path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size); + path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size); out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic; - pathseg[path_out_ref.offset >> 2] = out_tag; + memory[path_out_ref.offset >> 2] = out_tag; PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic); break; case Element_Stroke: @@ -398,7 +399,7 @@ void main() { vec2 lw = get_linewidth(st); anno_stroke.bbox = st.bbox + vec4(-lw, lw); anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z)); - AnnotatedRef out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size); + AnnotatedRef out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size); Annotated_Stroke_write(out_ref, anno_stroke); break; case Element_Fill: @@ -406,7 +407,7 @@ void main() { AnnoFill anno_fill; anno_fill.rgba_color = fill.rgba_color; anno_fill.bbox = st.bbox; - out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size); + out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size); Annotated_Fill_write(out_ref, anno_fill); break; case Element_BeginClip: @@ -414,14 +415,14 @@ void main() { AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox); // This is the absolute bbox, it's been transformed during encoding. anno_begin_clip.bbox = begin_clip.bbox; - out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size); + out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size); Annotated_BeginClip_write(out_ref, anno_begin_clip); break; case Element_EndClip: Clip end_clip = Element_EndClip_read(this_ref); // This bbox is expected to be the same as the begin one. AnnoClip anno_end_clip = AnnoClip(end_clip.bbox); - out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size); + out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size); Annotated_EndClip_write(out_ref, anno_end_clip); break; } diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv index fd314c81..95171f82 100644 Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp index d01627ca..0183f332 100644 --- a/piet-gpu/shader/kernel4.comp +++ b/piet-gpu/shader/kernel4.comp @@ -11,50 +11,42 @@ #extension GL_EXT_nonuniform_qualifier : enable #include "setup.h" +#include "mem.h" #define CHUNK 8 #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK) layout(local_size_x = TILE_WIDTH_PX, local_size_y = CHUNK_DY) in; -// Same concern that this should be readonly as in kernel 3. -layout(set = 0, binding = 0) buffer PtclBuf { - uint[] ptcl; +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; }; -layout(set = 0, binding = 1) buffer TileBuf { - uint[] tile; -}; - -layout(set = 0, binding = 2) buffer ClipScratchBuf { - uint[] clip_scratch; -}; - -layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image; +layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image; -layout(set = 0, binding = 4) uniform sampler2D textures[]; +layout(set = 0, binding = 3) uniform sampler2D textures[]; #include "ptcl.h" #include "tile.h" #define BLEND_STACK_SIZE 4 -// Layout of clip_scratch buffer: -// [0] is the alloc bump offset (in units of 32 bit words, initially 0) -// Starting at 1 is a sequence of frames. +// Layout of a clip scratch frame: // Each frame is WIDTH * HEIGHT 32-bit words, then a link reference. +// Link offset and frame size in 32-bit words. #define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX) #define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1) -shared uint sh_clip_alloc; +shared Alloc sh_clip_alloc; -// Allocate a scratch buffer for clipping. Unlike offsets in the rest of the code, -// it counts 32-bit words. -uint alloc_clip_buf(uint link) { +// Allocate a scratch buffer for clipping. +Alloc alloc_clip_buf(uint link) { if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) { - uint alloc = atomicAdd(clip_scratch[0], CLIP_BUF_SIZE) + 1; + Alloc alloc = malloc(CLIP_BUF_SIZE * 4); + if (!alloc.failed) { + memory[(alloc.offset >> 2) + CLIP_LINK_OFFSET] = link; + } sh_clip_alloc = alloc; - clip_scratch[alloc + CLIP_LINK_OFFSET] = link; } barrier(); return sh_clip_alloc; @@ -95,8 +87,12 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) { } void main() { + if (mem_overflow) { + return; + } + uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x; - CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC); + CmdRef cmd_ref = CmdRef(conf.ptcl_base + tile_ix * PTCL_INITIAL_ALLOC); uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y); vec2 xy = vec2(xy_uint); @@ -168,10 +164,14 @@ void main() { uint blend_slot = blend_sp % BLEND_STACK_SIZE; if (blend_sp == blend_spill + BLEND_STACK_SIZE) { // spill to scratch buffer - clip_tos = alloc_clip_buf(clip_tos); - uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; + Alloc alloc = alloc_clip_buf(clip_tos); + if (alloc.failed) { + return; + } + clip_tos = alloc.offset; + uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; for (uint k = 0; k < CHUNK; k++) { - clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k]; + memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k]; } blend_spill++; } @@ -194,11 +194,11 @@ void main() { CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref); blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE; if (blend_sp == blend_spill) { - uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; + uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y; for (uint k = 0; k < CHUNK; k++) { - blend_stack[blend_slot][k] = clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY]; + blend_stack[blend_slot][k] = memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY]; } - clip_tos = clip_scratch[clip_tos + CLIP_LINK_OFFSET]; + clip_tos = memory[(clip_tos >> 2) + CLIP_LINK_OFFSET]; blend_spill--; } blend_sp--; diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv index 33ed4f8d..f7acb7f2 100644 Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ diff --git a/piet-gpu/shader/mem.h b/piet-gpu/shader/mem.h new file mode 100644 index 00000000..9373cbfa --- /dev/null +++ b/piet-gpu/shader/mem.h @@ -0,0 +1,29 @@ +// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense + +layout(set = 0, binding = 0) buffer Memory { + // offset into memory of the next allocation, initialized by the user. + uint mem_offset; + bool mem_overflow; + uint[] memory; +}; + +// Alloc represents a memory allocation. +struct Alloc { + // offset in bytes into memory. + uint offset; + // failed is true if the allocation overflowed memory. + bool failed; +}; + +// malloc allocates size bytes of memory. +Alloc malloc(uint size) { + Alloc a; + // Round up to nearest 32-bit word. + size = (size + 3) & ~3; + a.offset = atomicAdd(mem_offset, size); + a.failed = a.offset + size > memory.length() * 4; + if (a.failed) { + mem_overflow = true; + } + return a; +} diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp index cbca10fe..20c35866 100644 --- a/piet-gpu/shader/path_coarse.comp +++ b/piet-gpu/shader/path_coarse.comp @@ -8,24 +8,15 @@ #extension GL_GOOGLE_include_directive : enable #include "setup.h" +#include "mem.h" #define LG_COARSE_WG 5 #define COARSE_WG (1 << LG_COARSE_WG) layout(local_size_x = COARSE_WG, local_size_y = 1) in; -layout(set = 0, binding = 0) buffer PathSegBuf { - uint[] pathseg; -}; - -layout(set = 0, binding = 1) buffer AllocBuf { - uint n_paths; - uint n_pathseg; - uint alloc; -}; - -layout(set = 0, binding = 2) buffer TileBuf { - uint[] tile; +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; }; #include "pathseg.h" @@ -96,11 +87,15 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) { } void main() { + if (mem_overflow) { + return; + } + uint element_ix = gl_GlobalInvocationID.x; - PathSegRef ref = PathSegRef(element_ix * PathSeg_size); + PathSegRef ref = PathSegRef(conf.pathseg_base + element_ix * PathSeg_size); uint tag = PathSeg_Nop; - if (element_ix < n_pathseg) { + if (element_ix < conf.n_pathseg) { tag = PathSeg_tag(ref); } switch (tag) { @@ -128,7 +123,7 @@ void main() { uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1); uint path_ix = cubic.path_ix; - Path path = Path_read(PathRef(path_ix * Path_size)); + Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size)); ivec4 bbox = ivec4(path.bbox); vec2 p0 = cubic.p0; qp0 = cubic.p0; @@ -187,7 +182,12 @@ void main() { // TODO: can be tighter, use c to bound width uint n_tile_alloc = uint((x1 - x0) * (y1 - y0)); // Consider using subgroups to aggregate atomic add. - uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size); + Alloc tile_alloc = malloc(n_tile_alloc * TileSeg_size); + if (tile_alloc.failed) { + return; + } + uint tile_offset = tile_alloc.offset; + TileSeg tile_seg; int xray = int(floor(p0.x*SX)); @@ -204,7 +204,7 @@ void main() { int backdrop = p1.y < p0.y ? 1 : -1; TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop)); uint tile_el = tile_ref.offset >> 2; - atomicAdd(tile[tile_el + 1], backdrop); + atomicAdd(memory[tile_el + 1], backdrop); } // next_xray is the xray for the next scanline; the line segment intersects @@ -227,7 +227,7 @@ void main() { float tile_x0 = float(x * TILE_WIDTH_PX); TileRef tile_ref = Tile_index(path.tiles, uint(base + x)); uint tile_el = tile_ref.offset >> 2; - uint old = atomicExchange(tile[tile_el], tile_offset); + uint old = atomicExchange(memory[tile_el], tile_offset); tile_seg.origin = p0; tile_seg.vector = p1 - p0; float y_edge = 0.0; diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv index bec287b0..6b2e3b34 100644 Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ diff --git a/piet-gpu/shader/pathseg.h b/piet-gpu/shader/pathseg.h index 4ce6c460..ecba9c5d 100644 --- a/piet-gpu/shader/pathseg.h +++ b/piet-gpu/shader/pathseg.h @@ -89,11 +89,11 @@ PathSegRef PathSeg_index(PathSegRef ref, uint index) { PathFillLine PathFillLine_read(PathFillLineRef ref) { uint ix = ref.offset >> 2; - uint raw0 = pathseg[ix + 0]; - uint raw1 = pathseg[ix + 1]; - uint raw2 = pathseg[ix + 2]; - uint raw3 = pathseg[ix + 3]; - uint raw4 = pathseg[ix + 4]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; + uint raw4 = memory[ix + 4]; PathFillLine s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -103,22 +103,22 @@ PathFillLine PathFillLine_read(PathFillLineRef ref) { void PathFillLine_write(PathFillLineRef ref, PathFillLine s) { uint ix = ref.offset >> 2; - pathseg[ix + 0] = floatBitsToUint(s.p0.x); - pathseg[ix + 1] = floatBitsToUint(s.p0.y); - pathseg[ix + 2] = floatBitsToUint(s.p1.x); - pathseg[ix + 3] = floatBitsToUint(s.p1.y); - pathseg[ix + 4] = s.path_ix; + memory[ix + 0] = floatBitsToUint(s.p0.x); + memory[ix + 1] = floatBitsToUint(s.p0.y); + memory[ix + 2] = floatBitsToUint(s.p1.x); + memory[ix + 3] = floatBitsToUint(s.p1.y); + memory[ix + 4] = s.path_ix; } PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) { uint ix = ref.offset >> 2; - uint raw0 = pathseg[ix + 0]; - uint raw1 = pathseg[ix + 1]; - uint raw2 = pathseg[ix + 2]; - uint raw3 = pathseg[ix + 3]; - uint raw4 = pathseg[ix + 4]; - uint raw5 = pathseg[ix + 5]; - uint raw6 = pathseg[ix + 6]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; + uint raw4 = memory[ix + 4]; + uint raw5 = memory[ix + 5]; + uint raw6 = memory[ix + 6]; PathStrokeLine s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -129,26 +129,26 @@ PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) { void PathStrokeLine_write(PathStrokeLineRef ref, PathStrokeLine s) { uint ix = ref.offset >> 2; - pathseg[ix + 0] = floatBitsToUint(s.p0.x); - pathseg[ix + 1] = floatBitsToUint(s.p0.y); - pathseg[ix + 2] = floatBitsToUint(s.p1.x); - pathseg[ix + 3] = floatBitsToUint(s.p1.y); - pathseg[ix + 4] = s.path_ix; - pathseg[ix + 5] = floatBitsToUint(s.stroke.x); - pathseg[ix + 6] = floatBitsToUint(s.stroke.y); + memory[ix + 0] = floatBitsToUint(s.p0.x); + memory[ix + 1] = floatBitsToUint(s.p0.y); + memory[ix + 2] = floatBitsToUint(s.p1.x); + memory[ix + 3] = floatBitsToUint(s.p1.y); + memory[ix + 4] = s.path_ix; + memory[ix + 5] = floatBitsToUint(s.stroke.x); + memory[ix + 6] = floatBitsToUint(s.stroke.y); } PathFillCubic PathFillCubic_read(PathFillCubicRef ref) { uint ix = ref.offset >> 2; - uint raw0 = pathseg[ix + 0]; - uint raw1 = pathseg[ix + 1]; - uint raw2 = pathseg[ix + 2]; - uint raw3 = pathseg[ix + 3]; - uint raw4 = pathseg[ix + 4]; - uint raw5 = pathseg[ix + 5]; - uint raw6 = pathseg[ix + 6]; - uint raw7 = pathseg[ix + 7]; - uint raw8 = pathseg[ix + 8]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; + uint raw4 = memory[ix + 4]; + uint raw5 = memory[ix + 5]; + uint raw6 = memory[ix + 6]; + uint raw7 = memory[ix + 7]; + uint raw8 = memory[ix + 8]; PathFillCubic s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -160,30 +160,30 @@ PathFillCubic PathFillCubic_read(PathFillCubicRef ref) { void PathFillCubic_write(PathFillCubicRef ref, PathFillCubic s) { uint ix = ref.offset >> 2; - pathseg[ix + 0] = floatBitsToUint(s.p0.x); - pathseg[ix + 1] = floatBitsToUint(s.p0.y); - pathseg[ix + 2] = floatBitsToUint(s.p1.x); - pathseg[ix + 3] = floatBitsToUint(s.p1.y); - pathseg[ix + 4] = floatBitsToUint(s.p2.x); - pathseg[ix + 5] = floatBitsToUint(s.p2.y); - pathseg[ix + 6] = floatBitsToUint(s.p3.x); - pathseg[ix + 7] = floatBitsToUint(s.p3.y); - pathseg[ix + 8] = s.path_ix; + memory[ix + 0] = floatBitsToUint(s.p0.x); + memory[ix + 1] = floatBitsToUint(s.p0.y); + memory[ix + 2] = floatBitsToUint(s.p1.x); + memory[ix + 3] = floatBitsToUint(s.p1.y); + memory[ix + 4] = floatBitsToUint(s.p2.x); + memory[ix + 5] = floatBitsToUint(s.p2.y); + memory[ix + 6] = floatBitsToUint(s.p3.x); + memory[ix + 7] = floatBitsToUint(s.p3.y); + memory[ix + 8] = s.path_ix; } PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) { uint ix = ref.offset >> 2; - uint raw0 = pathseg[ix + 0]; - uint raw1 = pathseg[ix + 1]; - uint raw2 = pathseg[ix + 2]; - uint raw3 = pathseg[ix + 3]; - uint raw4 = pathseg[ix + 4]; - uint raw5 = pathseg[ix + 5]; - uint raw6 = pathseg[ix + 6]; - uint raw7 = pathseg[ix + 7]; - uint raw8 = pathseg[ix + 8]; - uint raw9 = pathseg[ix + 9]; - uint raw10 = pathseg[ix + 10]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; + uint raw4 = memory[ix + 4]; + uint raw5 = memory[ix + 5]; + uint raw6 = memory[ix + 6]; + uint raw7 = memory[ix + 7]; + uint raw8 = memory[ix + 8]; + uint raw9 = memory[ix + 9]; + uint raw10 = memory[ix + 10]; PathStrokeCubic s; s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -196,21 +196,21 @@ PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) { void PathStrokeCubic_write(PathStrokeCubicRef ref, PathStrokeCubic s) { uint ix = ref.offset >> 2; - pathseg[ix + 0] = floatBitsToUint(s.p0.x); - pathseg[ix + 1] = floatBitsToUint(s.p0.y); - pathseg[ix + 2] = floatBitsToUint(s.p1.x); - pathseg[ix + 3] = floatBitsToUint(s.p1.y); - pathseg[ix + 4] = floatBitsToUint(s.p2.x); - pathseg[ix + 5] = floatBitsToUint(s.p2.y); - pathseg[ix + 6] = floatBitsToUint(s.p3.x); - pathseg[ix + 7] = floatBitsToUint(s.p3.y); - pathseg[ix + 8] = s.path_ix; - pathseg[ix + 9] = floatBitsToUint(s.stroke.x); - pathseg[ix + 10] = floatBitsToUint(s.stroke.y); + memory[ix + 0] = floatBitsToUint(s.p0.x); + memory[ix + 1] = floatBitsToUint(s.p0.y); + memory[ix + 2] = floatBitsToUint(s.p1.x); + memory[ix + 3] = floatBitsToUint(s.p1.y); + memory[ix + 4] = floatBitsToUint(s.p2.x); + memory[ix + 5] = floatBitsToUint(s.p2.y); + memory[ix + 6] = floatBitsToUint(s.p3.x); + memory[ix + 7] = floatBitsToUint(s.p3.y); + memory[ix + 8] = s.path_ix; + memory[ix + 9] = floatBitsToUint(s.stroke.x); + memory[ix + 10] = floatBitsToUint(s.stroke.y); } uint PathSeg_tag(PathSegRef ref) { - return pathseg[ref.offset >> 2]; + return memory[ref.offset >> 2]; } PathFillLine PathSeg_FillLine_read(PathSegRef ref) { @@ -230,26 +230,26 @@ PathStrokeCubic PathSeg_StrokeCubic_read(PathSegRef ref) { } void PathSeg_Nop_write(PathSegRef ref) { - pathseg[ref.offset >> 2] = PathSeg_Nop; + memory[ref.offset >> 2] = PathSeg_Nop; } void PathSeg_FillLine_write(PathSegRef ref, PathFillLine s) { - pathseg[ref.offset >> 2] = PathSeg_FillLine; + memory[ref.offset >> 2] = PathSeg_FillLine; PathFillLine_write(PathFillLineRef(ref.offset + 4), s); } void PathSeg_StrokeLine_write(PathSegRef ref, PathStrokeLine s) { - pathseg[ref.offset >> 2] = PathSeg_StrokeLine; + memory[ref.offset >> 2] = PathSeg_StrokeLine; PathStrokeLine_write(PathStrokeLineRef(ref.offset + 4), s); } void PathSeg_FillCubic_write(PathSegRef ref, PathFillCubic s) { - pathseg[ref.offset >> 2] = PathSeg_FillCubic; + memory[ref.offset >> 2] = PathSeg_FillCubic; PathFillCubic_write(PathFillCubicRef(ref.offset + 4), s); } void PathSeg_StrokeCubic_write(PathSegRef ref, PathStrokeCubic s) { - pathseg[ref.offset >> 2] = PathSeg_StrokeCubic; + memory[ref.offset >> 2] = PathSeg_StrokeCubic; PathStrokeCubic_write(PathStrokeCubicRef(ref.offset + 4), s); } diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h index 20b362ed..eb21eac0 100644 --- a/piet-gpu/shader/ptcl.h +++ b/piet-gpu/shader/ptcl.h @@ -173,10 +173,10 @@ CmdRef Cmd_index(CmdRef ref, uint index) { CmdCircle CmdCircle_read(CmdCircleRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; - uint raw1 = ptcl[ix + 1]; - uint raw2 = ptcl[ix + 2]; - uint raw3 = ptcl[ix + 3]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; CmdCircle s; s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.radius = uintBitsToFloat(raw2); @@ -186,18 +186,18 @@ CmdCircle CmdCircle_read(CmdCircleRef ref) { void CmdCircle_write(CmdCircleRef ref, CmdCircle s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = floatBitsToUint(s.center.x); - ptcl[ix + 1] = floatBitsToUint(s.center.y); - ptcl[ix + 2] = floatBitsToUint(s.radius); - ptcl[ix + 3] = s.rgba_color; + memory[ix + 0] = floatBitsToUint(s.center.x); + memory[ix + 1] = floatBitsToUint(s.center.y); + memory[ix + 2] = floatBitsToUint(s.radius); + memory[ix + 3] = s.rgba_color; } CmdLine CmdLine_read(CmdLineRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; - uint raw1 = ptcl[ix + 1]; - uint raw2 = ptcl[ix + 2]; - uint raw3 = ptcl[ix + 3]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; CmdLine s; s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -206,17 +206,17 @@ CmdLine CmdLine_read(CmdLineRef ref) { void CmdLine_write(CmdLineRef ref, CmdLine s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = floatBitsToUint(s.start.x); - ptcl[ix + 1] = floatBitsToUint(s.start.y); - ptcl[ix + 2] = floatBitsToUint(s.end.x); - ptcl[ix + 3] = floatBitsToUint(s.end.y); + memory[ix + 0] = floatBitsToUint(s.start.x); + memory[ix + 1] = floatBitsToUint(s.start.y); + memory[ix + 2] = floatBitsToUint(s.end.x); + memory[ix + 3] = floatBitsToUint(s.end.y); } CmdStroke CmdStroke_read(CmdStrokeRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; - uint raw1 = ptcl[ix + 1]; - uint raw2 = ptcl[ix + 2]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; CmdStroke s; s.tile_ref = raw0; s.half_width = uintBitsToFloat(raw1); @@ -226,16 +226,16 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) { void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.tile_ref; - ptcl[ix + 1] = floatBitsToUint(s.half_width); - ptcl[ix + 2] = s.rgba_color; + memory[ix + 0] = s.tile_ref; + memory[ix + 1] = floatBitsToUint(s.half_width); + memory[ix + 2] = s.rgba_color; } CmdFill CmdFill_read(CmdFillRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; - uint raw1 = ptcl[ix + 1]; - uint raw2 = ptcl[ix + 2]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; CmdFill s; s.tile_ref = raw0; s.backdrop = int(raw1); @@ -245,15 +245,15 @@ CmdFill CmdFill_read(CmdFillRef ref) { void CmdFill_write(CmdFillRef ref, CmdFill s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.tile_ref; - ptcl[ix + 1] = uint(s.backdrop); - ptcl[ix + 2] = s.rgba_color; + memory[ix + 0] = s.tile_ref; + memory[ix + 1] = uint(s.backdrop); + memory[ix + 2] = s.rgba_color; } CmdBeginClip CmdBeginClip_read(CmdBeginClipRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; - uint raw1 = ptcl[ix + 1]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; CmdBeginClip s; s.tile_ref = raw0; s.backdrop = int(raw1); @@ -262,13 +262,13 @@ CmdBeginClip CmdBeginClip_read(CmdBeginClipRef ref) { void CmdBeginClip_write(CmdBeginClipRef ref, CmdBeginClip s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.tile_ref; - ptcl[ix + 1] = uint(s.backdrop); + memory[ix + 0] = s.tile_ref; + memory[ix + 1] = uint(s.backdrop); } CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; + uint raw0 = memory[ix + 0]; CmdBeginSolidClip s; s.alpha = uintBitsToFloat(raw0); return s; @@ -276,12 +276,12 @@ CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) { void CmdBeginSolidClip_write(CmdBeginSolidClipRef ref, CmdBeginSolidClip s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = floatBitsToUint(s.alpha); + memory[ix + 0] = floatBitsToUint(s.alpha); } CmdEndClip CmdEndClip_read(CmdEndClipRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; + uint raw0 = memory[ix + 0]; CmdEndClip s; s.alpha = uintBitsToFloat(raw0); return s; @@ -289,12 +289,12 @@ CmdEndClip CmdEndClip_read(CmdEndClipRef ref) { void CmdEndClip_write(CmdEndClipRef ref, CmdEndClip s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = floatBitsToUint(s.alpha); + memory[ix + 0] = floatBitsToUint(s.alpha); } CmdSolid CmdSolid_read(CmdSolidRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; + uint raw0 = memory[ix + 0]; CmdSolid s; s.rgba_color = raw0; return s; @@ -302,12 +302,12 @@ CmdSolid CmdSolid_read(CmdSolidRef ref) { void CmdSolid_write(CmdSolidRef ref, CmdSolid s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.rgba_color; + memory[ix + 0] = s.rgba_color; } CmdSolidMask CmdSolidMask_read(CmdSolidMaskRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; + uint raw0 = memory[ix + 0]; CmdSolidMask s; s.mask = uintBitsToFloat(raw0); return s; @@ -315,12 +315,12 @@ CmdSolidMask CmdSolidMask_read(CmdSolidMaskRef ref) { void CmdSolidMask_write(CmdSolidMaskRef ref, CmdSolidMask s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = floatBitsToUint(s.mask); + memory[ix + 0] = floatBitsToUint(s.mask); } CmdJump CmdJump_read(CmdJumpRef ref) { uint ix = ref.offset >> 2; - uint raw0 = ptcl[ix + 0]; + uint raw0 = memory[ix + 0]; CmdJump s; s.new_ref = raw0; return s; @@ -328,11 +328,11 @@ CmdJump CmdJump_read(CmdJumpRef ref) { void CmdJump_write(CmdJumpRef ref, CmdJump s) { uint ix = ref.offset >> 2; - ptcl[ix + 0] = s.new_ref; + memory[ix + 0] = s.new_ref; } uint Cmd_tag(CmdRef ref) { - return ptcl[ref.offset >> 2]; + return memory[ref.offset >> 2]; } CmdCircle Cmd_Circle_read(CmdRef ref) { @@ -376,56 +376,56 @@ CmdJump Cmd_Jump_read(CmdRef ref) { } void Cmd_End_write(CmdRef ref) { - ptcl[ref.offset >> 2] = Cmd_End; + memory[ref.offset >> 2] = Cmd_End; } void Cmd_Circle_write(CmdRef ref, CmdCircle s) { - ptcl[ref.offset >> 2] = Cmd_Circle; + memory[ref.offset >> 2] = Cmd_Circle; CmdCircle_write(CmdCircleRef(ref.offset + 4), s); } void Cmd_Line_write(CmdRef ref, CmdLine s) { - ptcl[ref.offset >> 2] = Cmd_Line; + memory[ref.offset >> 2] = Cmd_Line; CmdLine_write(CmdLineRef(ref.offset + 4), s); } void Cmd_Fill_write(CmdRef ref, CmdFill s) { - ptcl[ref.offset >> 2] = Cmd_Fill; + memory[ref.offset >> 2] = Cmd_Fill; CmdFill_write(CmdFillRef(ref.offset + 4), s); } void Cmd_BeginClip_write(CmdRef ref, CmdBeginClip s) { - ptcl[ref.offset >> 2] = Cmd_BeginClip; + memory[ref.offset >> 2] = Cmd_BeginClip; CmdBeginClip_write(CmdBeginClipRef(ref.offset + 4), s); } void Cmd_BeginSolidClip_write(CmdRef ref, CmdBeginSolidClip s) { - ptcl[ref.offset >> 2] = Cmd_BeginSolidClip; + memory[ref.offset >> 2] = Cmd_BeginSolidClip; CmdBeginSolidClip_write(CmdBeginSolidClipRef(ref.offset + 4), s); } void Cmd_EndClip_write(CmdRef ref, CmdEndClip s) { - ptcl[ref.offset >> 2] = Cmd_EndClip; + memory[ref.offset >> 2] = Cmd_EndClip; CmdEndClip_write(CmdEndClipRef(ref.offset + 4), s); } void Cmd_Stroke_write(CmdRef ref, CmdStroke s) { - ptcl[ref.offset >> 2] = Cmd_Stroke; + memory[ref.offset >> 2] = Cmd_Stroke; CmdStroke_write(CmdStrokeRef(ref.offset + 4), s); } void Cmd_Solid_write(CmdRef ref, CmdSolid s) { - ptcl[ref.offset >> 2] = Cmd_Solid; + memory[ref.offset >> 2] = Cmd_Solid; CmdSolid_write(CmdSolidRef(ref.offset + 4), s); } void Cmd_SolidMask_write(CmdRef ref, CmdSolidMask s) { - ptcl[ref.offset >> 2] = Cmd_SolidMask; + memory[ref.offset >> 2] = Cmd_SolidMask; CmdSolidMask_write(CmdSolidMaskRef(ref.offset + 4), s); } void Cmd_Jump_write(CmdRef ref, CmdJump s) { - ptcl[ref.offset >> 2] = Cmd_Jump; + memory[ref.offset >> 2] = Cmd_Jump; CmdJump_write(CmdJumpRef(ref.offset + 4), s); } diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h index 6998a16f..9a7d580d 100644 --- a/piet-gpu/shader/setup.h +++ b/piet-gpu/shader/setup.h @@ -28,3 +28,13 @@ #define N_TILE (N_TILE_X * N_TILE_Y) #define LG_N_TILE (7 + LG_WG_FACTOR) #define N_SLICE (N_TILE / 32) + +struct Config { + uint n_elements; // paths + uint n_pathseg; + uint tile_base; + uint bin_base; + uint ptcl_base; + uint pathseg_base; + uint anno_base; +}; diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h index a33cb5ab..133ff53e 100644 --- a/piet-gpu/shader/tile.h +++ b/piet-gpu/shader/tile.h @@ -51,9 +51,9 @@ TileSegRef TileSeg_index(TileSegRef ref, uint index) { Path Path_read(PathRef ref) { uint ix = ref.offset >> 2; - uint raw0 = tile[ix + 0]; - uint raw1 = tile[ix + 1]; - uint raw2 = tile[ix + 2]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; Path s; s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16); s.tiles = TileRef(raw2); @@ -62,15 +62,15 @@ Path Path_read(PathRef ref) { void Path_write(PathRef ref, Path s) { uint ix = ref.offset >> 2; - tile[ix + 0] = s.bbox.x | (s.bbox.y << 16); - tile[ix + 1] = s.bbox.z | (s.bbox.w << 16); - tile[ix + 2] = s.tiles.offset; + memory[ix + 0] = s.bbox.x | (s.bbox.y << 16); + memory[ix + 1] = s.bbox.z | (s.bbox.w << 16); + memory[ix + 2] = s.tiles.offset; } Tile Tile_read(TileRef ref) { uint ix = ref.offset >> 2; - uint raw0 = tile[ix + 0]; - uint raw1 = tile[ix + 1]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; Tile s; s.tile = TileSegRef(raw0); s.backdrop = int(raw1); @@ -79,18 +79,18 @@ Tile Tile_read(TileRef ref) { void Tile_write(TileRef ref, Tile s) { uint ix = ref.offset >> 2; - tile[ix + 0] = s.tile.offset; - tile[ix + 1] = uint(s.backdrop); + memory[ix + 0] = s.tile.offset; + memory[ix + 1] = uint(s.backdrop); } TileSeg TileSeg_read(TileSegRef ref) { uint ix = ref.offset >> 2; - uint raw0 = tile[ix + 0]; - uint raw1 = tile[ix + 1]; - uint raw2 = tile[ix + 2]; - uint raw3 = tile[ix + 3]; - uint raw4 = tile[ix + 4]; - uint raw5 = tile[ix + 5]; + uint raw0 = memory[ix + 0]; + uint raw1 = memory[ix + 1]; + uint raw2 = memory[ix + 2]; + uint raw3 = memory[ix + 3]; + uint raw4 = memory[ix + 4]; + uint raw5 = memory[ix + 5]; TileSeg s; s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1)); s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3)); @@ -101,11 +101,11 @@ TileSeg TileSeg_read(TileSegRef ref) { void TileSeg_write(TileSegRef ref, TileSeg s) { uint ix = ref.offset >> 2; - tile[ix + 0] = floatBitsToUint(s.origin.x); - tile[ix + 1] = floatBitsToUint(s.origin.y); - tile[ix + 2] = floatBitsToUint(s.vector.x); - tile[ix + 3] = floatBitsToUint(s.vector.y); - tile[ix + 4] = floatBitsToUint(s.y_edge); - tile[ix + 5] = s.next.offset; + memory[ix + 0] = floatBitsToUint(s.origin.x); + memory[ix + 1] = floatBitsToUint(s.origin.y); + memory[ix + 2] = floatBitsToUint(s.vector.x); + memory[ix + 3] = floatBitsToUint(s.vector.y); + memory[ix + 4] = floatBitsToUint(s.y_edge); + memory[ix + 5] = s.next.offset; } diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp index 64529d1c..3280f7ff 100644 --- a/piet-gpu/shader/tile_alloc.comp +++ b/piet-gpu/shader/tile_alloc.comp @@ -6,24 +6,15 @@ #extension GL_GOOGLE_include_directive : enable #include "setup.h" +#include "mem.h" #define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR) #define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG) layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in; -layout(set = 0, binding = 0) buffer AnnotatedBuf { - uint[] annotated; -}; - -layout(set = 0, binding = 1) buffer AllocBuf { - uint n_elements; - uint n_pathseg; - uint alloc; -}; - -layout(set = 0, binding = 2) buffer TileBuf { - uint[] tile; +layout(set = 0, binding = 1) readonly buffer ConfigBuf { + Config conf; }; #include "annotated.h" @@ -34,16 +25,20 @@ layout(set = 0, binding = 2) buffer TileBuf { #define SY (1.0 / float(TILE_HEIGHT_PX)) shared uint sh_tile_count[TILE_ALLOC_WG]; -shared uint sh_tile_alloc; +shared Alloc sh_tile_alloc; void main() { + if (mem_overflow) { + return; + } + uint th_ix = gl_LocalInvocationID.x; uint element_ix = gl_GlobalInvocationID.x; - PathRef path_ref = PathRef(element_ix * Path_size); - AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size); + PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size); + AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size); uint tag = Annotated_Nop; - if (element_ix < n_elements) { + if (element_ix < conf.n_elements) { tag = Annotated_tag(ref); } int x0 = 0, y0 = 0, x1 = 0, y1 = 0; @@ -86,23 +81,26 @@ void main() { sh_tile_count[th_ix] = tile_count; } if (th_ix == TILE_ALLOC_WG - 1) { - sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size); + sh_tile_alloc = malloc(tile_count * Tile_size); } barrier(); - uint alloc_start = sh_tile_alloc; + Alloc alloc_start = sh_tile_alloc; + if (alloc_start.failed) { + return; + } - if (element_ix < n_elements) { + if (element_ix < conf.n_elements) { uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0; - path.tiles = TileRef(alloc_start + Tile_size * tile_subix); + path.tiles = TileRef(alloc_start.offset + Tile_size * tile_subix); Path_write(path_ref, path); } // Zero out allocated tiles efficiently uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4); - uint start_ix = alloc_start >> 2; + uint start_ix = alloc_start.offset >> 2; for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) { // Note: this interleaving is faster than using Tile_write // by a significant amount. - tile[start_ix + i] = 0; + memory[start_ix + i] = 0; } } diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv index e901bad1..e407222c 100644 Binary files a/piet-gpu/shader/tile_alloc.spv and b/piet-gpu/shader/tile_alloc.spv differ diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs index 18688886..02726648 100644 --- a/piet-gpu/src/lib.rs +++ b/piet-gpu/src/lib.rs @@ -156,15 +156,16 @@ pub fn dump_k1_data(k1_buf: &[u32]) { pub struct Renderer { pub image_dev: hub::Image, // resulting image - scene_buf: hub::Buffer, - scene_dev: hub::Buffer, + scene_buf_host: hub::Buffer, + scene_buf_dev: hub::Buffer, - pub state_buf: hub::Buffer, - pub anno_buf: hub::Buffer, - pub pathseg_buf: hub::Buffer, - pub tile_buf: hub::Buffer, - pub bin_buf: hub::Buffer, - pub ptcl_buf: hub::Buffer, + memory_buf_host: hub::Buffer, + memory_buf_dev: hub::Buffer, + + state_buf: hub::Buffer, + + config_buf_host: hub::Buffer, + config_buf_dev: hub::Buffer, el_pipeline: hub::Pipeline, el_ds: hub::DescriptorSet, @@ -178,23 +179,12 @@ pub struct Renderer { backdrop_pipeline: hub::Pipeline, backdrop_ds: hub::DescriptorSet, - tile_alloc_buf_host: hub::Buffer, - tile_alloc_buf_dev: hub::Buffer, - bin_pipeline: hub::Pipeline, bin_ds: hub::DescriptorSet, - bin_alloc_buf_host: hub::Buffer, - bin_alloc_buf_dev: hub::Buffer, - coarse_pipeline: hub::Pipeline, coarse_ds: hub::DescriptorSet, - coarse_alloc_buf_host: hub::Buffer, - coarse_alloc_buf_dev: hub::Buffer, - - clip_scratch_buf: hub::Buffer, - k4_pipeline: hub::Pipeline, k4_ds: hub::DescriptorSet, @@ -221,88 +211,83 @@ impl Renderer { n_elements, n_paths, n_pathseg ); - let mut scene_buf = session + let mut scene_buf_host = session .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host) .unwrap(); - let scene_dev = session + let scene_buf_dev = session .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev) .unwrap(); - scene_buf.write(&scene)?; + scene_buf_host.write(&scene)?; let state_buf = session.create_buffer(1 * 1024 * 1024, dev)?; - let anno_buf = session.create_buffer(64 * 1024 * 1024, dev)?; - let pathseg_buf = session.create_buffer(64 * 1024 * 1024, dev)?; - let tile_buf = session.create_buffer(64 * 1024 * 1024, dev)?; - let bin_buf = session.create_buffer(64 * 1024 * 1024, dev)?; - let ptcl_buf = session.create_buffer(48 * 1024 * 1024, dev)?; let image_dev = session.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?; + let mut config_buf_host = session.create_buffer(7*4, host)?; + let config_buf_dev = session.create_buffer(7*4, dev)?; + + // TODO: constants + const PATH_SIZE: usize = 12; + const BIN_SIZE: usize = 8; + const PATHSEG_SIZE: usize = 48; + const ANNO_SIZE: usize = 28; + let mut alloc = 0; + let tile_base = alloc; + alloc += ((n_paths + 3) & !3) * PATH_SIZE; + let bin_base = alloc; + alloc += ((n_paths + 255) & !255) * BIN_SIZE; + let ptcl_base = alloc; + alloc += WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; + let pathseg_base = alloc; + alloc += (n_pathseg * PATHSEG_SIZE + 3) & !3; + let anno_base = alloc; + alloc += (n_paths * ANNO_SIZE + 3) & !3; + config_buf_host.write(&[n_paths as u32, n_pathseg as u32, tile_base as u32, bin_base as u32, ptcl_base as u32, pathseg_base as u32, anno_base as u32])?; + + let mut memory_buf_host = session.create_buffer(2*4, host)?; + let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?; + memory_buf_host.write(&[alloc as u32, 0 /* Overflow flag */])?; + let el_code = include_bytes!("../shader/elements.spv"); let el_pipeline = session.create_simple_compute_pipeline(el_code, 4)?; let el_ds = session.create_simple_descriptor_set( &el_pipeline, - &[&scene_dev, &state_buf, &anno_buf, &pathseg_buf], + &[&memory_buf_dev, &config_buf_dev, &scene_buf_dev, &state_buf], )?; - let mut tile_alloc_buf_host = session.create_buffer(12, host)?; - let tile_alloc_buf_dev = session.create_buffer(12, dev)?; - - // TODO: constants - const PATH_SIZE: usize = 12; - let tile_alloc_start = ((n_paths + 31) & !31) * PATH_SIZE; - tile_alloc_buf_host.write(&[n_paths as u32, n_pathseg as u32, tile_alloc_start as u32])?; let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv"); - let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 3)?; + let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 2)?; let tile_ds = session.create_simple_descriptor_set( &tile_pipeline, - &[&anno_buf, &tile_alloc_buf_dev, &tile_buf], + &[&memory_buf_dev, &config_buf_dev], )?; let path_alloc_code = include_bytes!("../shader/path_coarse.spv"); - let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 3)?; + let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 2)?; let path_ds = session.create_simple_descriptor_set( &path_pipeline, - &[&pathseg_buf, &tile_alloc_buf_dev, &tile_buf], + &[&memory_buf_dev, &config_buf_dev], )?; let backdrop_alloc_code = include_bytes!("../shader/backdrop.spv"); - let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 3)?; + let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?; let backdrop_ds = session.create_simple_descriptor_set( &backdrop_pipeline, - &[&anno_buf, &tile_alloc_buf_dev, &tile_buf], + &[&memory_buf_dev, &config_buf_dev], )?; - let mut bin_alloc_buf_host = session.create_buffer(8, host)?; - let bin_alloc_buf_dev = session.create_buffer(8, dev)?; - // TODO: constants - let bin_alloc_start = ((n_paths + 255) & !255) * 8; - bin_alloc_buf_host.write(&[n_paths as u32, bin_alloc_start as u32])?; let bin_code = include_bytes!("../shader/binning.spv"); - let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 3)?; + let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 2)?; let bin_ds = session.create_simple_descriptor_set( &bin_pipeline, - &[&anno_buf, &bin_alloc_buf_dev, &bin_buf], + &[&memory_buf_dev, &config_buf_dev], )?; - let clip_scratch_buf = session.create_buffer(1024 * 1024, dev)?; - - let mut coarse_alloc_buf_host = session.create_buffer(8, host)?; - let coarse_alloc_buf_dev = session.create_buffer(8, dev)?; - - let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC; - coarse_alloc_buf_host.write(&[n_paths as u32, coarse_alloc_start as u32])?; let coarse_code = include_bytes!("../shader/coarse.spv"); - let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 5)?; + let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 2)?; let coarse_ds = session.create_simple_descriptor_set( &coarse_pipeline, - &[ - &anno_buf, - &bin_buf, - &tile_buf, - &coarse_alloc_buf_dev, - &ptcl_buf, - ], + &[&memory_buf_dev, &config_buf_dev], )?; let bg_image = Self::make_test_bg_image(&session); @@ -318,20 +303,25 @@ impl Renderer { let sampler = session.create_sampler(SamplerParams::Linear)?; let k4_pipeline = session .pipeline_builder() - .add_buffers(3) + .add_buffers(2) .add_images(1) .add_textures(max_textures) .create_compute_pipeline(&session, k4_code)?; let k4_ds = session .descriptor_set_builder() - .add_buffers(&[&ptcl_buf, &tile_buf, &clip_scratch_buf]) + .add_buffers(&[&memory_buf_dev, &config_buf_dev]) .add_images(&[&image_dev]) .add_textures(&[&bg_image], &sampler) .build(&session, &k4_pipeline)?; Ok(Renderer { - scene_buf, - scene_dev, + scene_buf_host, + scene_buf_dev, + memory_buf_host, + memory_buf_dev, + state_buf, + config_buf_host, + config_buf_dev, image_dev, el_pipeline, el_ds, @@ -347,19 +337,6 @@ impl Renderer { coarse_ds, k4_pipeline, k4_ds, - state_buf, - anno_buf, - pathseg_buf, - tile_buf, - bin_buf, - ptcl_buf, - tile_alloc_buf_host, - tile_alloc_buf_dev, - bin_alloc_buf_host, - bin_alloc_buf_dev, - coarse_alloc_buf_host, - coarse_alloc_buf_dev, - clip_scratch_buf, n_elements, n_paths, n_pathseg, @@ -368,21 +345,16 @@ impl Renderer { } pub unsafe fn record(&self, cmd_buf: &mut hub::CmdBuf, query_pool: &hub::QueryPool) { - cmd_buf.copy_buffer(self.scene_buf.vk_buffer(), self.scene_dev.vk_buffer()); - cmd_buf.copy_buffer( - self.tile_alloc_buf_host.vk_buffer(), - self.tile_alloc_buf_dev.vk_buffer(), - ); + cmd_buf.copy_buffer(self.scene_buf_host.vk_buffer(), self.scene_buf_dev.vk_buffer()); cmd_buf.copy_buffer( - self.bin_alloc_buf_host.vk_buffer(), - self.bin_alloc_buf_dev.vk_buffer(), + self.config_buf_host.vk_buffer(), + self.config_buf_dev.vk_buffer(), ); cmd_buf.copy_buffer( - self.coarse_alloc_buf_host.vk_buffer(), - self.coarse_alloc_buf_dev.vk_buffer(), + self.memory_buf_host.vk_buffer(), + self.memory_buf_dev.vk_buffer(), ); cmd_buf.clear_buffer(self.state_buf.vk_buffer(), None); - cmd_buf.clear_buffer(self.clip_scratch_buf.vk_buffer(), Some(4)); cmd_buf.memory_barrier(); cmd_buf.image_barrier( self.image_dev.vk_image(),