diff --git a/piet-gpu-derive/src/glsl.rs b/piet-gpu-derive/src/glsl.rs
index b55dda49..24096378 100644
--- a/piet-gpu-derive/src/glsl.rs
+++ b/piet-gpu-derive/src/glsl.rs
@@ -31,17 +31,22 @@ pub fn gen_glsl(module: &LayoutModule) -> String {
 
     for name in &module.def_names {
         let def = module.defs.get(name).unwrap();
+        // "state" and "scene" are accessed through a buffer named after the module;
+        // every other module is accessed through the buffer named "memory".
+        let mem = "memory".to_owned();
+        let own_buffer = module.name == "state" || module.name == "scene";
+        let buf_name = if own_buffer { &module.name } else { &mem };
         match def {
             (_size, LayoutTypeDef::Struct(fields)) => {
-                gen_struct_read(&mut r, &module.name, &name, fields);
+                gen_struct_read(&mut r, buf_name, &name, fields);
                 if module.gpu_write {
-                    gen_struct_write(&mut r, &module.name, &name, fields);
+                    gen_struct_write(&mut r, buf_name, &name, fields);
                 }
             }
             (_size, LayoutTypeDef::Enum(en)) => {
-                gen_enum_read(&mut r, &module.name, &name, en);
+                gen_enum_read(&mut r, buf_name, &name, en);
                 if module.gpu_write {
-                    gen_enum_write(&mut r, &module.name, &name, en);
+                    gen_enum_write(&mut r, buf_name, &name, en);
                 }
             }
         }
diff --git a/piet-gpu/shader/annotated.h b/piet-gpu/shader/annotated.h
index 1e1ebe6a..8a757efa 100644
--- a/piet-gpu/shader/annotated.h
+++ b/piet-gpu/shader/annotated.h
@@ -64,11 +64,11 @@ AnnotatedRef Annotated_index(AnnotatedRef ref, uint index) {
 
 AnnoFill AnnoFill_read(AnnoFillRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = annotated[ix + 0];
-    uint raw1 = annotated[ix + 1];
-    uint raw2 = annotated[ix + 2];
-    uint raw3 = annotated[ix + 3];
-    uint raw4 = annotated[ix + 4];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
     AnnoFill s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.rgba_color = raw4;
@@ -77,21 +77,21 @@ AnnoFill AnnoFill_read(AnnoFillRef ref) {
 
 void AnnoFill_write(AnnoFillRef ref, AnnoFill s) {
     uint ix = ref.offset >> 2;
-    annotated[ix + 0] = floatBitsToUint(s.bbox.x);
-    annotated[ix + 1] = floatBitsToUint(s.bbox.y);
-    annotated[ix + 2] = floatBitsToUint(s.bbox.z);
-    annotated[ix + 3] = floatBitsToUint(s.bbox.w);
-    annotated[ix + 4] = s.rgba_color;
+    memory[ix + 0] = floatBitsToUint(s.bbox.x);
+    memory[ix + 1] = floatBitsToUint(s.bbox.y);
+    memory[ix + 2] = floatBitsToUint(s.bbox.z);
+    memory[ix + 3] = floatBitsToUint(s.bbox.w);
+    memory[ix + 4] = s.rgba_color;
 }
 
 AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = annotated[ix + 0];
-    uint raw1 = annotated[ix + 1];
-    uint raw2 = annotated[ix + 2];
-    uint raw3 = annotated[ix + 3];
-    uint raw4 = annotated[ix + 4];
-    uint raw5 = annotated[ix + 5];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
     AnnoStroke s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     s.rgba_color = raw4;
@@ -101,20 +101,20 @@ AnnoStroke AnnoStroke_read(AnnoStrokeRef ref) {
 
 void AnnoStroke_write(AnnoStrokeRef ref, AnnoStroke s) {
     uint ix = ref.offset >> 2;
-    annotated[ix + 0] = floatBitsToUint(s.bbox.x);
-    annotated[ix + 1] = floatBitsToUint(s.bbox.y);
-    annotated[ix + 2] = floatBitsToUint(s.bbox.z);
-    annotated[ix + 3] = floatBitsToUint(s.bbox.w);
-    annotated[ix + 4] = s.rgba_color;
-    annotated[ix + 5] = floatBitsToUint(s.linewidth);
+    memory[ix + 0] = floatBitsToUint(s.bbox.x);
+    memory[ix + 1] = floatBitsToUint(s.bbox.y);
+    memory[ix + 2] = floatBitsToUint(s.bbox.z);
+    memory[ix + 3] = floatBitsToUint(s.bbox.w);
+    memory[ix + 4] = s.rgba_color;
+    memory[ix + 5] = floatBitsToUint(s.linewidth);
 }
 
 AnnoClip AnnoClip_read(AnnoClipRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = annotated[ix + 0];
-    uint raw1 = annotated[ix + 1];
-    uint raw2 = annotated[ix + 2];
-    uint raw3 = annotated[ix + 3];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
     AnnoClip s;
     s.bbox = vec4(uintBitsToFloat(raw0), uintBitsToFloat(raw1), uintBitsToFloat(raw2), uintBitsToFloat(raw3));
     return s;
@@ -122,14 +122,14 @@ AnnoClip AnnoClip_read(AnnoClipRef ref) {
 
 void AnnoClip_write(AnnoClipRef ref, AnnoClip s) {
     uint ix = ref.offset >> 2;
-    annotated[ix + 0] = floatBitsToUint(s.bbox.x);
-    annotated[ix + 1] = floatBitsToUint(s.bbox.y);
-    annotated[ix + 2] = floatBitsToUint(s.bbox.z);
-    annotated[ix + 3] = floatBitsToUint(s.bbox.w);
+    memory[ix + 0] = floatBitsToUint(s.bbox.x);
+    memory[ix + 1] = floatBitsToUint(s.bbox.y);
+    memory[ix + 2] = floatBitsToUint(s.bbox.z);
+    memory[ix + 3] = floatBitsToUint(s.bbox.w);
 }
 
 uint Annotated_tag(AnnotatedRef ref) {
-    return annotated[ref.offset >> 2];
+    return memory[ref.offset >> 2];
 }
 
 AnnoStroke Annotated_Stroke_read(AnnotatedRef ref) {
@@ -149,26 +149,26 @@ AnnoClip Annotated_EndClip_read(AnnotatedRef ref) {
 }
 
 void Annotated_Nop_write(AnnotatedRef ref) {
-    annotated[ref.offset >> 2] = Annotated_Nop;
+    memory[ref.offset >> 2] = Annotated_Nop;
 }
 
 void Annotated_Stroke_write(AnnotatedRef ref, AnnoStroke s) {
-    annotated[ref.offset >> 2] = Annotated_Stroke;
+    memory[ref.offset >> 2] = Annotated_Stroke;
     AnnoStroke_write(AnnoStrokeRef(ref.offset + 4), s);
 }
 
 void Annotated_Fill_write(AnnotatedRef ref, AnnoFill s) {
-    annotated[ref.offset >> 2] = Annotated_Fill;
+    memory[ref.offset >> 2] = Annotated_Fill;
     AnnoFill_write(AnnoFillRef(ref.offset + 4), s);
 }
 
 void Annotated_BeginClip_write(AnnotatedRef ref, AnnoClip s) {
-    annotated[ref.offset >> 2] = Annotated_BeginClip;
+    memory[ref.offset >> 2] = Annotated_BeginClip;
     AnnoClip_write(AnnoClipRef(ref.offset + 4), s);
 }
 
 void Annotated_EndClip_write(AnnotatedRef ref, AnnoClip s) {
-    annotated[ref.offset >> 2] = Annotated_EndClip;
+    memory[ref.offset >> 2] = Annotated_EndClip;
     AnnoClip_write(AnnoClipRef(ref.offset + 4), s);
 }
 
diff --git a/piet-gpu/shader/backdrop.comp b/piet-gpu/shader/backdrop.comp
index 42eec9c9..f57d6e08 100644
--- a/piet-gpu/shader/backdrop.comp
+++ b/piet-gpu/shader/backdrop.comp
@@ -16,27 +16,15 @@
 #extension GL_GOOGLE_include_directive : enable
 
 #include "setup.h"
+#include "mem.h"
 
 #define LG_BACKDROP_WG (7 + LG_WG_FACTOR)
 #define BACKDROP_WG (1 << LG_BACKDROP_WG)
 
 layout(local_size_x = BACKDROP_WG, local_size_y = 1) in;
 
-layout(set = 0, binding = 0) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-// This is really only used for n_elements; maybe we can handle that
-// a different way, but it's convenient to have the same signature as
-// tile allocation.
-layout(set = 0, binding = 1) readonly buffer AllocBuf {
-    uint n_elements; // paths
-    uint n_pathseg;
-    uint alloc;
-};
-
-layout(set = 0, binding = 2) buffer TileBuf {
-    uint[] tile;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };
 
 #include "annotated.h"
@@ -47,18 +35,22 @@ shared uint sh_row_base[BACKDROP_WG];
 shared uint sh_row_width[BACKDROP_WG];
 
 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
 
     // Work assignment: 1 thread : 1 path element
     uint row_count = 0;
-    if (element_ix < n_elements) {
+    if (element_ix < conf.n_elements) {
         uint tag = Annotated_tag(ref);
         switch (tag) {
         case Annotated_Fill:
         case Annotated_BeginClip:
-            PathRef path_ref = PathRef(element_ix * Path_size);
+            PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size);
             Path path = Path_read(path_ref);
             sh_row_width[th_ix] = path.bbox.z - path.bbox.x;
             row_count = path.bbox.w - path.bbox.y;
@@ -98,11 +90,11 @@ void main() {
         // Process one row sequentially
         // Read backdrop value per tile and prefix sum it
         uint tile_el_ix = sh_row_base[el_ix] + seq_ix * 2 * width;
-        uint sum = tile[tile_el_ix];
+        uint sum = memory[tile_el_ix];
         for (uint x = 1; x < width; x++) {
             tile_el_ix += 2;
-            sum += tile[tile_el_ix];
-            tile[tile_el_ix] = sum;
+            sum += memory[tile_el_ix];
+            memory[tile_el_ix] = sum;
         }
     }
 }
diff --git a/piet-gpu/shader/backdrop.spv b/piet-gpu/shader/backdrop.spv
index 54bf7368..defe30e7 100644
Binary files a/piet-gpu/shader/backdrop.spv and b/piet-gpu/shader/backdrop.spv differ
diff --git a/piet-gpu/shader/binning.comp b/piet-gpu/shader/binning.comp
index fc1d3557..17acc767 100644
--- a/piet-gpu/shader/binning.comp
+++ b/piet-gpu/shader/binning.comp
@@ -10,20 +10,12 @@
 #extension GL_GOOGLE_include_directive : enable
 
 #include "setup.h"
+#include "mem.h"
 
 layout(local_size_x = N_TILE, local_size_y = 1) in;
 
-layout(set = 0, binding = 0) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-layout(set = 0, binding = 1) buffer AllocBuf {
-    uint n_elements; // paths
-    uint alloc;
-};
-
-layout(set = 0, binding = 2) buffer BinsBuf {
-    uint[] bins;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };
 
 #include "annotated.h"
@@ -41,19 +33,27 @@ layout(set = 0, binding = 2) buffer BinsBuf {
 shared uint bitmaps[N_SLICE][N_TILE];
 shared uint count[N_SLICE][N_TILE];
 shared uint sh_chunk_start[N_TILE];
+shared bool sh_alloc_failed;
 
 void main() {
-    uint my_n_elements = n_elements;
+    if (mem_overflow) {
+        return;
+    }
+
+    uint my_n_elements = conf.n_elements;
     uint my_partition = gl_WorkGroupID.x;
 
     for (uint i = 0; i < N_SLICE; i++) {
         bitmaps[i][gl_LocalInvocationID.x] = 0;
     }
+    if (gl_LocalInvocationID.x == 0) {
+        sh_alloc_failed = false;
+    }
     barrier();
 
     // Read inputs and determine coverage of bins
     uint element_ix = my_partition * N_TILE + gl_LocalInvocationID.x;
-    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
     uint tag = Annotated_Nop;
     if (element_ix < my_n_elements) {
         tag = Annotated_tag(ref);
@@ -103,19 +103,26 @@ void main() {
         count[i][gl_LocalInvocationID.x] = element_count;
     }
     // element_count is number of elements covering bin for this invocation.
-    uint chunk_start = 0;
+    Alloc chunk_alloc = Alloc(0, false);
     if (element_count != 0) {
         // TODO: aggregate atomic adds (subgroup is probably fastest)
-        chunk_start = atomicAdd(alloc, element_count * BinInstance_size);
-        sh_chunk_start[gl_LocalInvocationID.x] = chunk_start;
+        chunk_alloc = malloc(element_count * BinInstance_size);
+        sh_chunk_start[gl_LocalInvocationID.x] = chunk_alloc.offset;
+        if (chunk_alloc.failed) {
+            sh_alloc_failed = true;
+        }
     }
     // Note: it might be more efficient for reading to do this in the
     // other order (each bin is a contiguous sequence of partitions)
-    uint out_ix = (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
-    bins[out_ix] = element_count;
-    bins[out_ix + 1] = chunk_start;
+    uint out_ix = (conf.bin_base >> 2) + (my_partition * N_TILE + gl_LocalInvocationID.x) * 2;
+    memory[out_ix] = element_count;
+    memory[out_ix + 1] = chunk_alloc.offset;
 
     barrier();
+    if (sh_alloc_failed) {
+        return;
+    }
+
     // Use similar strategy as Laine & Karras paper; loop over bbox of bins
     // touched by this element
     x = x0;
diff --git a/piet-gpu/shader/binning.spv b/piet-gpu/shader/binning.spv
index abe17d49..da2df762 100644
Binary files a/piet-gpu/shader/binning.spv and b/piet-gpu/shader/binning.spv differ
diff --git a/piet-gpu/shader/bins.h b/piet-gpu/shader/bins.h
index bc32dda0..43642785 100644
--- a/piet-gpu/shader/bins.h
+++ b/piet-gpu/shader/bins.h
@@ -18,7 +18,7 @@ BinInstanceRef BinInstance_index(BinInstanceRef ref, uint index) {
 
 BinInstance BinInstance_read(BinInstanceRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = bins[ix + 0];
+    uint raw0 = memory[ix + 0];
     BinInstance s;
     s.element_ix = raw0;
     return s;
@@ -26,6 +26,6 @@ BinInstance BinInstance_read(BinInstanceRef ref) {
 
 void BinInstance_write(BinInstanceRef ref, BinInstance s) {
     uint ix = ref.offset >> 2;
-    bins[ix + 0] = s.element_ix;
+    memory[ix + 0] = s.element_ix;
 }
 
diff --git a/piet-gpu/shader/coarse.comp b/piet-gpu/shader/coarse.comp
index a173608b..a70318ad 100644
--- a/piet-gpu/shader/coarse.comp
+++ b/piet-gpu/shader/coarse.comp
@@ -14,28 +14,12 @@
 #extension GL_GOOGLE_include_directive : enable
 
 #include "setup.h"
+#include "mem.h"
 
 layout(local_size_x = N_TILE, local_size_y = 1) in;
 
-layout(set = 0, binding = 0) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-layout(set = 0, binding = 1) buffer BinsBuf {
-    uint[] bins;
-};
-
-layout(set = 0, binding = 2) buffer TileBuf {
-    uint[] tile;
-};
-
-layout(set = 0, binding = 3) buffer AllocBuf {
-    uint n_elements;
-    uint alloc;
-};
-
-layout(set = 0, binding = 4) buffer PtclBuf {
-    uint[] ptcl;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };
 
 #include "annotated.h"
@@ -65,22 +49,31 @@ shared uint sh_tile_base[N_TILE];
 shared uint sh_tile_stride[N_TILE];
 
 // Perhaps cmd_limit should be a global? This is a style question.
-void alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
-    if (cmd_ref.offset > cmd_limit) {
-        uint new_cmd = atomicAdd(alloc, PTCL_INITIAL_ALLOC);
-        CmdJump jump = CmdJump(new_cmd);
-        Cmd_Jump_write(cmd_ref, jump);
-        cmd_ref = CmdRef(new_cmd);
-        cmd_limit = new_cmd + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+bool alloc_cmd(inout CmdRef cmd_ref, inout uint cmd_limit) {
+    if (cmd_ref.offset < cmd_limit) {
+        return true;
     }
+    Alloc new_cmd = malloc(PTCL_INITIAL_ALLOC);
+    if (new_cmd.failed) {
+        return false;
+    }
+    CmdJump jump = CmdJump(new_cmd.offset);
+    Cmd_Jump_write(cmd_ref, jump);
+    cmd_ref = CmdRef(new_cmd.offset);
+    cmd_limit = new_cmd.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
+    return true;
 }
 
 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
     // Could use either linear or 2d layouts for both dispatch and
     // invocations within the workgroup. We'll use variables to abstract.
     uint bin_ix = N_TILE_X * gl_WorkGroupID.y + gl_WorkGroupID.x;
     uint partition_ix = 0;
-    uint n_partitions = (n_elements + N_TILE - 1) / N_TILE;
+    uint n_partitions = (conf.n_elements + N_TILE - 1) / N_TILE;
     uint th_ix = gl_LocalInvocationID.x;
 
     // Coordinates of top left of bin, in tiles.
@@ -91,7 +84,7 @@ void main() {
     uint tile_x = gl_LocalInvocationID.x % N_TILE_X;
     uint tile_y = gl_LocalInvocationID.x / N_TILE_X;
     uint this_tile_ix = (bin_tile_y + tile_y) * WIDTH_IN_TILES + bin_tile_x + tile_x;
-    CmdRef cmd_ref = CmdRef(this_tile_ix * PTCL_INITIAL_ALLOC);
+    CmdRef cmd_ref = CmdRef(conf.ptcl_base + this_tile_ix * PTCL_INITIAL_ALLOC);
     uint cmd_limit = cmd_ref.offset + PTCL_INITIAL_ALLOC - 2 * Cmd_size;
     // The nesting depth of the clip stack
     uint clip_depth = 0;
@@ -123,9 +116,9 @@ void main() {
                 part_start_ix = ready_ix;
                 uint count = 0;
                 if (th_ix < N_PART_READ && partition_ix + th_ix < n_partitions) {
-                    uint in_ix = ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
-                    count = bins[in_ix];
-                    sh_part_elements[th_ix] = bins[in_ix + 1];
+                    uint in_ix = (conf.bin_base >> 2) + ((partition_ix + th_ix) * N_TILE + bin_ix) * 2;
+                    count = memory[in_ix];
+                    sh_part_elements[th_ix] = memory[in_ix + 1];
                 }
                 // prefix sum of counts
                 for (uint i = 0; i < LG_N_PART_READ; i++) {
@@ -175,7 +168,7 @@ void main() {
         AnnotatedRef ref;
         if (th_ix + rd_ix < wr_ix) {
             element_ix = sh_elements[th_ix];
-            ref = AnnotatedRef(element_ix * Annotated_size);
+            ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
             tag = Annotated_tag(ref);
         }
 
@@ -189,7 +182,7 @@ void main() {
             // We have one "path" for each element, even if the element isn't
             // actually a path (currently EndClip, but images etc in the future).
             uint path_ix = element_ix;
-            Path path = Path_read(PathRef(path_ix * Path_size));
+            Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size));
             uint stride = path.bbox.z - path.bbox.x;
             sh_tile_stride[th_ix] = stride;
             int dx = int(path.bbox.x) - int(bin_tile_x);
@@ -232,7 +225,7 @@ void main() {
                     el_ix = probe;
                 }
             }
-            AnnotatedRef ref = AnnotatedRef(sh_elements[el_ix] * Annotated_size);
+            AnnotatedRef ref = AnnotatedRef(conf.anno_base + sh_elements[el_ix] * Annotated_size);
             uint tag = Annotated_tag(ref);
             uint seq_ix = ix - (el_ix > 0 ? sh_tile_count[el_ix - 1] : 0);
             uint width = sh_tile_width[el_ix];
@@ -281,7 +274,7 @@ void main() {
             // At this point, we read the element again from global memory.
             // If that turns out to be expensive, maybe we can pack it into
             // shared memory (or perhaps just the tag).
-            ref = AnnotatedRef(element_ix * Annotated_size);
+            ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
             tag = Annotated_tag(ref);
 
             if (clip_zero_depth == 0) {
@@ -290,7 +283,9 @@ void main() {
                     Tile tile = Tile_read(TileRef(sh_tile_base[element_ref_ix]
                         + (sh_tile_stride[element_ref_ix] * tile_y + tile_x) * Tile_size));
                     AnnoFill fill = Annotated_Fill_read(ref);
-                    alloc_cmd(cmd_ref, cmd_limit);
+                    if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                        break;
+                    }
                     if (tile.tile.offset != 0) {
                         CmdFill cmd_fill;
                         cmd_fill.tile_ref = tile.tile.offset;
@@ -310,7 +305,9 @@ void main() {
                     } else if (tile.tile.offset == 0 && clip_depth < 32) {
                         clip_one_mask |= (1 << clip_depth);
                     } else {
-                        alloc_cmd(cmd_ref, cmd_limit);
+                        if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                            break;
+                        }
                         if (tile.tile.offset != 0) {
                             CmdBeginClip cmd_begin_clip;
                             cmd_begin_clip.tile_ref = tile.tile.offset;
@@ -331,7 +328,9 @@ void main() {
                 case Annotated_EndClip:
                     clip_depth--;
                     if (clip_depth >= 32 || (clip_one_mask & (1 << clip_depth)) == 0) {
-                        alloc_cmd(cmd_ref, cmd_limit);
+                        if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                            break;
+                        }
                         Cmd_EndClip_write(cmd_ref, CmdEndClip(1.0));
                         cmd_ref.offset += Cmd_size;
                     }
@@ -344,7 +343,9 @@ void main() {
                     cmd_stroke.tile_ref = tile.tile.offset;
                     cmd_stroke.half_width = 0.5 * stroke.linewidth;
                     cmd_stroke.rgba_color = stroke.rgba_color;
-                    alloc_cmd(cmd_ref, cmd_limit);
+                    if (!alloc_cmd(cmd_ref, cmd_limit)) {
+                        break;
+                    }
                     Cmd_Stroke_write(cmd_ref, cmd_stroke);
                     cmd_ref.offset += Cmd_size;
                     break;
diff --git a/piet-gpu/shader/coarse.spv b/piet-gpu/shader/coarse.spv
index 215a97ab..260db696 100644
Binary files a/piet-gpu/shader/coarse.spv and b/piet-gpu/shader/coarse.spv differ
diff --git a/piet-gpu/shader/elements.comp b/piet-gpu/shader/elements.comp
index 5e8957f1..a0e50112 100644
--- a/piet-gpu/shader/elements.comp
+++ b/piet-gpu/shader/elements.comp
@@ -9,6 +9,9 @@
 #version 450
 #extension GL_GOOGLE_include_directive : enable
 
+#include "setup.h"
+#include "mem.h"
+
 #define N_ROWS 4
 #define WG_SIZE 32
 #define LG_WG_SIZE 5
@@ -16,28 +19,22 @@
 
 layout(local_size_x = WG_SIZE, local_size_y = 1) in;
 
-layout(set = 0, binding = 0) readonly buffer SceneBuf {
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
+};
+
+layout(set = 0, binding = 2) readonly buffer SceneBuf {
     uint[] scene;
 };
 
 // It would be better to use the Vulkan memory model than
 // "volatile" but shooting for compatibility here rather
 // than doing things right.
-layout(set = 0, binding = 1) volatile buffer StateBuf {
+layout(set = 0, binding = 3) volatile buffer StateBuf {
     uint part_counter;
     uint[] state;
 };
 
-// The annotated results are stored here.
-layout(set = 0, binding = 2) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-// Path segments are stored here.
-layout(set = 0, binding = 3) buffer PathSegBuf {
-    uint[] pathseg;
-};
-
 #include "scene.h"
 #include "state.h"
 #include "annotated.h"
@@ -175,6 +172,10 @@ shared uint sh_part_ix;
 shared State sh_prefix;
 
 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
     State th_state[N_ROWS];
     // Determine partition to process by atomic counter (described in Section
     // 4.4 of prefix sum paper).
@@ -341,9 +342,9 @@ void main() {
             }
             // We do encoding a bit by hand to minimize divergence. Another approach
             // would be to have a fill/stroke bool.
-            PathSegRef path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+            PathSegRef path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size);
             uint out_tag = tag == Element_FillLine ? PathSeg_FillCubic : PathSeg_StrokeCubic;
-            pathseg[path_out_ref.offset >> 2] = out_tag;
+            memory[path_out_ref.offset >> 2] = out_tag;
             PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
             break;
         case Element_FillQuad:
@@ -365,9 +366,9 @@ void main() {
             }
             // We do encoding a bit by hand to minimize divergence. Another approach
             // would be to have a fill/stroke bool.
-            path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+            path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size);
             out_tag = tag == Element_FillQuad ? PathSeg_FillCubic : PathSeg_StrokeCubic;
-            pathseg[path_out_ref.offset >> 2] = out_tag;
+            memory[path_out_ref.offset >> 2] = out_tag;
             PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
             break;
         case Element_FillCubic:
@@ -386,9 +387,9 @@ void main() {
             }
             // We do encoding a bit by hand to minimize divergence. Another approach
             // would be to have a fill/stroke bool.
-            path_out_ref = PathSegRef((st.pathseg_count - 1) * PathSeg_size);
+            path_out_ref = PathSegRef(conf.pathseg_base + (st.pathseg_count - 1) * PathSeg_size);
             out_tag = tag == Element_FillCubic ? PathSeg_FillCubic : PathSeg_StrokeCubic;
-            pathseg[path_out_ref.offset >> 2] = out_tag;
+            memory[path_out_ref.offset >> 2] = out_tag;
             PathStrokeCubic_write(PathStrokeCubicRef(path_out_ref.offset + 4), path_cubic);
             break;
         case Element_Stroke:
@@ -398,7 +399,7 @@ void main() {
             vec2 lw = get_linewidth(st);
             anno_stroke.bbox = st.bbox + vec4(-lw, lw);
             anno_stroke.linewidth = st.linewidth * sqrt(abs(st.mat.x * st.mat.w - st.mat.y * st.mat.z));
-            AnnotatedRef out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            AnnotatedRef out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
             Annotated_Stroke_write(out_ref, anno_stroke);
             break;
         case Element_Fill:
@@ -406,7 +407,7 @@ void main() {
             AnnoFill anno_fill;
             anno_fill.rgba_color = fill.rgba_color;
             anno_fill.bbox = st.bbox;
-            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
             Annotated_Fill_write(out_ref, anno_fill);
             break;
         case Element_BeginClip:
@@ -414,14 +415,14 @@ void main() {
             AnnoClip anno_begin_clip = AnnoClip(begin_clip.bbox);
             // This is the absolute bbox, it's been transformed during encoding.
             anno_begin_clip.bbox = begin_clip.bbox;
-            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
             Annotated_BeginClip_write(out_ref, anno_begin_clip);
             break;
         case Element_EndClip:
             Clip end_clip = Element_EndClip_read(this_ref);
             // This bbox is expected to be the same as the begin one.
             AnnoClip anno_end_clip = AnnoClip(end_clip.bbox);
-            out_ref = AnnotatedRef((st.path_count - 1) * Annotated_size);
+            out_ref = AnnotatedRef(conf.anno_base + (st.path_count - 1) * Annotated_size);
             Annotated_EndClip_write(out_ref, anno_end_clip);
             break;
         }
diff --git a/piet-gpu/shader/elements.spv b/piet-gpu/shader/elements.spv
index fd314c81..95171f82 100644
Binary files a/piet-gpu/shader/elements.spv and b/piet-gpu/shader/elements.spv differ
diff --git a/piet-gpu/shader/kernel4.comp b/piet-gpu/shader/kernel4.comp
index d01627ca..0183f332 100644
--- a/piet-gpu/shader/kernel4.comp
+++ b/piet-gpu/shader/kernel4.comp
@@ -11,50 +11,42 @@
 #extension GL_EXT_nonuniform_qualifier : enable
 
 #include "setup.h"
+#include "mem.h"
 
 #define CHUNK 8
 #define CHUNK_DY (TILE_HEIGHT_PX / CHUNK)
 layout(local_size_x = TILE_WIDTH_PX, local_size_y = CHUNK_DY) in;
 
-// Same concern that this should be readonly as in kernel 3.
-layout(set = 0, binding = 0) buffer PtclBuf {
-    uint[] ptcl;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };
 
-layout(set = 0, binding = 1) buffer TileBuf {
-    uint[] tile;
-};
-
-layout(set = 0, binding = 2) buffer ClipScratchBuf {
-    uint[] clip_scratch;
-};
-
-layout(rgba8, set = 0, binding = 3) uniform writeonly image2D image;
+layout(rgba8, set = 0, binding = 2) uniform writeonly image2D image;
 
-layout(set = 0, binding = 4) uniform sampler2D textures[];
+layout(set = 0, binding = 3) uniform sampler2D textures[];
 
 #include "ptcl.h"
 #include "tile.h"
 
 #define BLEND_STACK_SIZE 4
 
-// Layout of clip_scratch buffer:
-// [0] is the alloc bump offset (in units of 32 bit words, initially 0)
-// Starting at 1 is a sequence of frames.
+// Layout of a clip scratch frame:
 // Each frame is WIDTH * HEIGHT 32-bit words, then a link reference.
 
+// Link offset and frame size in 32-bit words.
 #define CLIP_LINK_OFFSET (TILE_WIDTH_PX * TILE_HEIGHT_PX)
 #define CLIP_BUF_SIZE (CLIP_LINK_OFFSET + 1)
 
-shared uint sh_clip_alloc;
+shared Alloc sh_clip_alloc;
 
-// Allocate a scratch buffer for clipping. Unlike offsets in the rest of the code,
-// it counts 32-bit words.
-uint alloc_clip_buf(uint link) {
+// Allocate a scratch buffer for clipping; invocation (0, 0) allocates and shares the result via sh_clip_alloc.
+Alloc alloc_clip_buf(uint link) {
     if (gl_LocalInvocationID.x == 0 && gl_LocalInvocationID.y == 0) {
-        uint alloc = atomicAdd(clip_scratch[0], CLIP_BUF_SIZE) + 1;
+        Alloc alloc = malloc(CLIP_BUF_SIZE * 4);
+        if (!alloc.failed) {
+            memory[(alloc.offset >> 2) + CLIP_LINK_OFFSET] = link;
+        }
         sh_clip_alloc = alloc;
-        clip_scratch[alloc + CLIP_LINK_OFFSET] = link;
     }
     barrier();
     return sh_clip_alloc;
@@ -95,8 +87,12 @@ float[CHUNK] computeArea(vec2 xy, int backdrop, uint tile_ref) {
 }
 
 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
     uint tile_ix = gl_WorkGroupID.y * WIDTH_IN_TILES + gl_WorkGroupID.x;
-    CmdRef cmd_ref = CmdRef(tile_ix * PTCL_INITIAL_ALLOC);
+    CmdRef cmd_ref = CmdRef(conf.ptcl_base + tile_ix * PTCL_INITIAL_ALLOC);
 
     uvec2 xy_uint = uvec2(gl_GlobalInvocationID.x, gl_LocalInvocationID.y + TILE_HEIGHT_PX * gl_WorkGroupID.y);
     vec2 xy = vec2(xy_uint);
@@ -168,10 +164,14 @@ void main() {
             uint blend_slot = blend_sp % BLEND_STACK_SIZE;
             if (blend_sp == blend_spill + BLEND_STACK_SIZE) {
                 // spill to scratch buffer
-                clip_tos = alloc_clip_buf(clip_tos);
-                uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                Alloc alloc = alloc_clip_buf(clip_tos);
+                if (alloc.failed) {
+                    return;
+                }
+                clip_tos = alloc.offset;
+                uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                 for (uint k = 0; k < CHUNK; k++) {
-                    clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
+                    memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY] = blend_stack[blend_slot][k];
                 }
                 blend_spill++;
             }
@@ -194,11 +194,11 @@ void main() {
             CmdEndClip end_clip = Cmd_EndClip_read(cmd_ref);
             blend_slot = (blend_sp - 1) % BLEND_STACK_SIZE;
             if (blend_sp == blend_spill) {
-                uint base_ix = clip_tos + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
+                uint base_ix = (clip_tos >> 2) + gl_LocalInvocationID.x + TILE_WIDTH_PX * gl_LocalInvocationID.y;
                 for (uint k = 0; k < CHUNK; k++) {
-                    blend_stack[blend_slot][k] = clip_scratch[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
+                    blend_stack[blend_slot][k] = memory[base_ix + k * TILE_WIDTH_PX * CHUNK_DY];
                 }
-                clip_tos = clip_scratch[clip_tos + CLIP_LINK_OFFSET];
+                clip_tos = memory[(clip_tos >> 2) + CLIP_LINK_OFFSET];
                 blend_spill--;
             }
             blend_sp--;
diff --git a/piet-gpu/shader/kernel4.spv b/piet-gpu/shader/kernel4.spv
index 33ed4f8d..f7acb7f2 100644
Binary files a/piet-gpu/shader/kernel4.spv and b/piet-gpu/shader/kernel4.spv differ
diff --git a/piet-gpu/shader/mem.h b/piet-gpu/shader/mem.h
new file mode 100644
index 00000000..9373cbfa
--- /dev/null
+++ b/piet-gpu/shader/mem.h
@@ -0,0 +1,29 @@
+// SPDX-License-Identifier: Apache-2.0 OR MIT OR Unlicense
+
+layout(set = 0, binding = 0) buffer Memory {
+    // byte offset into memory of the next allocation, initialized by the user.
+    uint mem_offset;
+    bool mem_overflow; // set to true by malloc when an allocation does not fit
+    uint[] memory; // storage handed out by malloc; indexed in 32-bit words (byte offset >> 2)
+};
+
+// Alloc represents a memory allocation.
+struct Alloc {
+    // offset in bytes into memory.
+    uint offset;
+    // failed is true if the allocation overflowed memory.
+    bool failed;
+};
+
+// malloc allocates size bytes of memory, rounded up to a whole 32-bit word.
+Alloc malloc(uint size) {
+    Alloc a;
+    size = (size + 3) & ~3;
+    uint limit = uint(memory.length()) * 4; // capacity of memory in bytes
+    a.offset = atomicAdd(mem_offset, size);
+    a.failed = size > limit || a.offset > limit - size; // a.offset + size could wrap around
+    if (a.failed) {
+        mem_overflow = true;
+    }
+    return a;
+}
diff --git a/piet-gpu/shader/path_coarse.comp b/piet-gpu/shader/path_coarse.comp
index cbca10fe..20c35866 100644
--- a/piet-gpu/shader/path_coarse.comp
+++ b/piet-gpu/shader/path_coarse.comp
@@ -8,24 +8,15 @@
 #extension GL_GOOGLE_include_directive : enable
 
 #include "setup.h"
+#include "mem.h"
 
 #define LG_COARSE_WG 5
 #define COARSE_WG (1 << LG_COARSE_WG)
 
 layout(local_size_x = COARSE_WG, local_size_y = 1) in;
 
-layout(set = 0, binding = 0) buffer PathSegBuf {
-    uint[] pathseg;
-};
-
-layout(set = 0, binding = 1) buffer AllocBuf {
-    uint n_paths;
-    uint n_pathseg;
-    uint alloc;
-};
-
-layout(set = 0, binding = 2) buffer TileBuf {
-    uint[] tile;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };
 
 #include "pathseg.h"
@@ -96,11 +87,15 @@ SubdivResult estimate_subdiv(vec2 p0, vec2 p1, vec2 p2, float sqrt_tol) {
 }
 
 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
     uint element_ix = gl_GlobalInvocationID.x;
-    PathSegRef ref = PathSegRef(element_ix * PathSeg_size);
+    PathSegRef ref = PathSegRef(conf.pathseg_base + element_ix * PathSeg_size);
 
     uint tag = PathSeg_Nop;
-    if (element_ix < n_pathseg) {
+    if (element_ix < conf.n_pathseg) {
         tag = PathSeg_tag(ref);
     }
     switch (tag) {
@@ -128,7 +123,7 @@ void main() {
         uint n = max(uint(ceil(val * 0.5 / sqrt(REM_ACCURACY))), 1);
 
         uint path_ix = cubic.path_ix;
-        Path path = Path_read(PathRef(path_ix * Path_size));
+        Path path = Path_read(PathRef(conf.tile_base + path_ix * Path_size));
         ivec4 bbox = ivec4(path.bbox);
         vec2 p0 = cubic.p0;
         qp0 = cubic.p0;
@@ -187,7 +182,12 @@ void main() {
                 // TODO: can be tighter, use c to bound width
                 uint n_tile_alloc = uint((x1 - x0) * (y1 - y0));
                 // Consider using subgroups to aggregate atomic add.
-                uint tile_offset = atomicAdd(alloc, n_tile_alloc * TileSeg_size);
+                Alloc tile_alloc = malloc(n_tile_alloc * TileSeg_size);
+                if (tile_alloc.failed) {
+                    return;
+                }
+                uint tile_offset = tile_alloc.offset;
+
                 TileSeg tile_seg;
 
                 int xray = int(floor(p0.x*SX));
@@ -204,7 +204,7 @@ void main() {
                         int backdrop = p1.y < p0.y ? 1 : -1;
                         TileRef tile_ref = Tile_index(path.tiles, uint(base + xbackdrop));
                         uint tile_el = tile_ref.offset >> 2;
-                        atomicAdd(tile[tile_el + 1], backdrop);
+                        atomicAdd(memory[tile_el + 1], backdrop);
                     }
 
                     // next_xray is the xray for the next scanline; the line segment intersects
@@ -227,7 +227,7 @@ void main() {
                         float tile_x0 = float(x * TILE_WIDTH_PX);
                         TileRef tile_ref = Tile_index(path.tiles, uint(base + x));
                         uint tile_el = tile_ref.offset >> 2;
-                        uint old = atomicExchange(tile[tile_el], tile_offset);
+                        uint old = atomicExchange(memory[tile_el], tile_offset);
                         tile_seg.origin = p0;
                         tile_seg.vector = p1 - p0;
                         float y_edge = 0.0;
diff --git a/piet-gpu/shader/path_coarse.spv b/piet-gpu/shader/path_coarse.spv
index bec287b0..6b2e3b34 100644
Binary files a/piet-gpu/shader/path_coarse.spv and b/piet-gpu/shader/path_coarse.spv differ
diff --git a/piet-gpu/shader/pathseg.h b/piet-gpu/shader/pathseg.h
index 4ce6c460..ecba9c5d 100644
--- a/piet-gpu/shader/pathseg.h
+++ b/piet-gpu/shader/pathseg.h
@@ -89,11 +89,11 @@ PathSegRef PathSeg_index(PathSegRef ref, uint index) {
 
 PathFillLine PathFillLine_read(PathFillLineRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = pathseg[ix + 0];
-    uint raw1 = pathseg[ix + 1];
-    uint raw2 = pathseg[ix + 2];
-    uint raw3 = pathseg[ix + 3];
-    uint raw4 = pathseg[ix + 4];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
     PathFillLine s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -103,22 +103,22 @@ PathFillLine PathFillLine_read(PathFillLineRef ref) {
 
 void PathFillLine_write(PathFillLineRef ref, PathFillLine s) {
     uint ix = ref.offset >> 2;
-    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
-    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
-    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
-    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
-    pathseg[ix + 4] = s.path_ix;
+    memory[ix + 0] = floatBitsToUint(s.p0.x);
+    memory[ix + 1] = floatBitsToUint(s.p0.y);
+    memory[ix + 2] = floatBitsToUint(s.p1.x);
+    memory[ix + 3] = floatBitsToUint(s.p1.y);
+    memory[ix + 4] = s.path_ix;
 }
 
 PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = pathseg[ix + 0];
-    uint raw1 = pathseg[ix + 1];
-    uint raw2 = pathseg[ix + 2];
-    uint raw3 = pathseg[ix + 3];
-    uint raw4 = pathseg[ix + 4];
-    uint raw5 = pathseg[ix + 5];
-    uint raw6 = pathseg[ix + 6];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
+    uint raw6 = memory[ix + 6];
     PathStrokeLine s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -129,26 +129,26 @@ PathStrokeLine PathStrokeLine_read(PathStrokeLineRef ref) {
 
 void PathStrokeLine_write(PathStrokeLineRef ref, PathStrokeLine s) {
     uint ix = ref.offset >> 2;
-    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
-    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
-    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
-    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
-    pathseg[ix + 4] = s.path_ix;
-    pathseg[ix + 5] = floatBitsToUint(s.stroke.x);
-    pathseg[ix + 6] = floatBitsToUint(s.stroke.y);
+    memory[ix + 0] = floatBitsToUint(s.p0.x);
+    memory[ix + 1] = floatBitsToUint(s.p0.y);
+    memory[ix + 2] = floatBitsToUint(s.p1.x);
+    memory[ix + 3] = floatBitsToUint(s.p1.y);
+    memory[ix + 4] = s.path_ix;
+    memory[ix + 5] = floatBitsToUint(s.stroke.x);
+    memory[ix + 6] = floatBitsToUint(s.stroke.y);
 }
 
 PathFillCubic PathFillCubic_read(PathFillCubicRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = pathseg[ix + 0];
-    uint raw1 = pathseg[ix + 1];
-    uint raw2 = pathseg[ix + 2];
-    uint raw3 = pathseg[ix + 3];
-    uint raw4 = pathseg[ix + 4];
-    uint raw5 = pathseg[ix + 5];
-    uint raw6 = pathseg[ix + 6];
-    uint raw7 = pathseg[ix + 7];
-    uint raw8 = pathseg[ix + 8];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
+    uint raw6 = memory[ix + 6];
+    uint raw7 = memory[ix + 7];
+    uint raw8 = memory[ix + 8];
     PathFillCubic s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -160,30 +160,30 @@ PathFillCubic PathFillCubic_read(PathFillCubicRef ref) {
 
 void PathFillCubic_write(PathFillCubicRef ref, PathFillCubic s) {
     uint ix = ref.offset >> 2;
-    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
-    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
-    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
-    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
-    pathseg[ix + 4] = floatBitsToUint(s.p2.x);
-    pathseg[ix + 5] = floatBitsToUint(s.p2.y);
-    pathseg[ix + 6] = floatBitsToUint(s.p3.x);
-    pathseg[ix + 7] = floatBitsToUint(s.p3.y);
-    pathseg[ix + 8] = s.path_ix;
+    memory[ix + 0] = floatBitsToUint(s.p0.x);
+    memory[ix + 1] = floatBitsToUint(s.p0.y);
+    memory[ix + 2] = floatBitsToUint(s.p1.x);
+    memory[ix + 3] = floatBitsToUint(s.p1.y);
+    memory[ix + 4] = floatBitsToUint(s.p2.x);
+    memory[ix + 5] = floatBitsToUint(s.p2.y);
+    memory[ix + 6] = floatBitsToUint(s.p3.x);
+    memory[ix + 7] = floatBitsToUint(s.p3.y);
+    memory[ix + 8] = s.path_ix;
 }
 
 PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = pathseg[ix + 0];
-    uint raw1 = pathseg[ix + 1];
-    uint raw2 = pathseg[ix + 2];
-    uint raw3 = pathseg[ix + 3];
-    uint raw4 = pathseg[ix + 4];
-    uint raw5 = pathseg[ix + 5];
-    uint raw6 = pathseg[ix + 6];
-    uint raw7 = pathseg[ix + 7];
-    uint raw8 = pathseg[ix + 8];
-    uint raw9 = pathseg[ix + 9];
-    uint raw10 = pathseg[ix + 10];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
+    uint raw6 = memory[ix + 6];
+    uint raw7 = memory[ix + 7];
+    uint raw8 = memory[ix + 8];
+    uint raw9 = memory[ix + 9];
+    uint raw10 = memory[ix + 10];
     PathStrokeCubic s;
     s.p0 = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.p1 = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -196,21 +196,21 @@ PathStrokeCubic PathStrokeCubic_read(PathStrokeCubicRef ref) {
 
 void PathStrokeCubic_write(PathStrokeCubicRef ref, PathStrokeCubic s) {
     uint ix = ref.offset >> 2;
-    pathseg[ix + 0] = floatBitsToUint(s.p0.x);
-    pathseg[ix + 1] = floatBitsToUint(s.p0.y);
-    pathseg[ix + 2] = floatBitsToUint(s.p1.x);
-    pathseg[ix + 3] = floatBitsToUint(s.p1.y);
-    pathseg[ix + 4] = floatBitsToUint(s.p2.x);
-    pathseg[ix + 5] = floatBitsToUint(s.p2.y);
-    pathseg[ix + 6] = floatBitsToUint(s.p3.x);
-    pathseg[ix + 7] = floatBitsToUint(s.p3.y);
-    pathseg[ix + 8] = s.path_ix;
-    pathseg[ix + 9] = floatBitsToUint(s.stroke.x);
-    pathseg[ix + 10] = floatBitsToUint(s.stroke.y);
+    memory[ix + 0] = floatBitsToUint(s.p0.x);
+    memory[ix + 1] = floatBitsToUint(s.p0.y);
+    memory[ix + 2] = floatBitsToUint(s.p1.x);
+    memory[ix + 3] = floatBitsToUint(s.p1.y);
+    memory[ix + 4] = floatBitsToUint(s.p2.x);
+    memory[ix + 5] = floatBitsToUint(s.p2.y);
+    memory[ix + 6] = floatBitsToUint(s.p3.x);
+    memory[ix + 7] = floatBitsToUint(s.p3.y);
+    memory[ix + 8] = s.path_ix;
+    memory[ix + 9] = floatBitsToUint(s.stroke.x);
+    memory[ix + 10] = floatBitsToUint(s.stroke.y);
 }
 
 uint PathSeg_tag(PathSegRef ref) {
-    return pathseg[ref.offset >> 2];
+    return memory[ref.offset >> 2];
 }
 
 PathFillLine PathSeg_FillLine_read(PathSegRef ref) {
@@ -230,26 +230,26 @@ PathStrokeCubic PathSeg_StrokeCubic_read(PathSegRef ref) {
 }
 
 void PathSeg_Nop_write(PathSegRef ref) {
-    pathseg[ref.offset >> 2] = PathSeg_Nop;
+    memory[ref.offset >> 2] = PathSeg_Nop;
 }
 
 void PathSeg_FillLine_write(PathSegRef ref, PathFillLine s) {
-    pathseg[ref.offset >> 2] = PathSeg_FillLine;
+    memory[ref.offset >> 2] = PathSeg_FillLine;
     PathFillLine_write(PathFillLineRef(ref.offset + 4), s);
 }
 
 void PathSeg_StrokeLine_write(PathSegRef ref, PathStrokeLine s) {
-    pathseg[ref.offset >> 2] = PathSeg_StrokeLine;
+    memory[ref.offset >> 2] = PathSeg_StrokeLine;
     PathStrokeLine_write(PathStrokeLineRef(ref.offset + 4), s);
 }
 
 void PathSeg_FillCubic_write(PathSegRef ref, PathFillCubic s) {
-    pathseg[ref.offset >> 2] = PathSeg_FillCubic;
+    memory[ref.offset >> 2] = PathSeg_FillCubic;
     PathFillCubic_write(PathFillCubicRef(ref.offset + 4), s);
 }
 
 void PathSeg_StrokeCubic_write(PathSegRef ref, PathStrokeCubic s) {
-    pathseg[ref.offset >> 2] = PathSeg_StrokeCubic;
+    memory[ref.offset >> 2] = PathSeg_StrokeCubic;
     PathStrokeCubic_write(PathStrokeCubicRef(ref.offset + 4), s);
 }
 
diff --git a/piet-gpu/shader/ptcl.h b/piet-gpu/shader/ptcl.h
index 20b362ed..eb21eac0 100644
--- a/piet-gpu/shader/ptcl.h
+++ b/piet-gpu/shader/ptcl.h
@@ -173,10 +173,10 @@ CmdRef Cmd_index(CmdRef ref, uint index) {
 
 CmdCircle CmdCircle_read(CmdCircleRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
-    uint raw2 = ptcl[ix + 2];
-    uint raw3 = ptcl[ix + 3];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
     CmdCircle s;
     s.center = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.radius = uintBitsToFloat(raw2);
@@ -186,18 +186,18 @@ CmdCircle CmdCircle_read(CmdCircleRef ref) {
 
 void CmdCircle_write(CmdCircleRef ref, CmdCircle s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.center.x);
-    ptcl[ix + 1] = floatBitsToUint(s.center.y);
-    ptcl[ix + 2] = floatBitsToUint(s.radius);
-    ptcl[ix + 3] = s.rgba_color;
+    memory[ix + 0] = floatBitsToUint(s.center.x);
+    memory[ix + 1] = floatBitsToUint(s.center.y);
+    memory[ix + 2] = floatBitsToUint(s.radius);
+    memory[ix + 3] = s.rgba_color;
 }
 
 CmdLine CmdLine_read(CmdLineRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
-    uint raw2 = ptcl[ix + 2];
-    uint raw3 = ptcl[ix + 3];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
     CmdLine s;
     s.start = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.end = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -206,17 +206,17 @@ CmdLine CmdLine_read(CmdLineRef ref) {
 
 void CmdLine_write(CmdLineRef ref, CmdLine s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.start.x);
-    ptcl[ix + 1] = floatBitsToUint(s.start.y);
-    ptcl[ix + 2] = floatBitsToUint(s.end.x);
-    ptcl[ix + 3] = floatBitsToUint(s.end.y);
+    memory[ix + 0] = floatBitsToUint(s.start.x);
+    memory[ix + 1] = floatBitsToUint(s.start.y);
+    memory[ix + 2] = floatBitsToUint(s.end.x);
+    memory[ix + 3] = floatBitsToUint(s.end.y);
 }
 
 CmdStroke CmdStroke_read(CmdStrokeRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
-    uint raw2 = ptcl[ix + 2];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
     CmdStroke s;
     s.tile_ref = raw0;
     s.half_width = uintBitsToFloat(raw1);
@@ -226,16 +226,16 @@ CmdStroke CmdStroke_read(CmdStrokeRef ref) {
 
 void CmdStroke_write(CmdStrokeRef ref, CmdStroke s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.tile_ref;
-    ptcl[ix + 1] = floatBitsToUint(s.half_width);
-    ptcl[ix + 2] = s.rgba_color;
+    memory[ix + 0] = s.tile_ref;
+    memory[ix + 1] = floatBitsToUint(s.half_width);
+    memory[ix + 2] = s.rgba_color;
 }
 
 CmdFill CmdFill_read(CmdFillRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
-    uint raw2 = ptcl[ix + 2];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
     CmdFill s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -245,15 +245,15 @@ CmdFill CmdFill_read(CmdFillRef ref) {
 
 void CmdFill_write(CmdFillRef ref, CmdFill s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.tile_ref;
-    ptcl[ix + 1] = uint(s.backdrop);
-    ptcl[ix + 2] = s.rgba_color;
+    memory[ix + 0] = s.tile_ref;
+    memory[ix + 1] = uint(s.backdrop);
+    memory[ix + 2] = s.rgba_color;
 }
 
 CmdBeginClip CmdBeginClip_read(CmdBeginClipRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
-    uint raw1 = ptcl[ix + 1];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
     CmdBeginClip s;
     s.tile_ref = raw0;
     s.backdrop = int(raw1);
@@ -262,13 +262,13 @@ CmdBeginClip CmdBeginClip_read(CmdBeginClipRef ref) {
 
 void CmdBeginClip_write(CmdBeginClipRef ref, CmdBeginClip s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.tile_ref;
-    ptcl[ix + 1] = uint(s.backdrop);
+    memory[ix + 0] = s.tile_ref;
+    memory[ix + 1] = uint(s.backdrop);
 }
 
 CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
     CmdBeginSolidClip s;
     s.alpha = uintBitsToFloat(raw0);
     return s;
@@ -276,12 +276,12 @@ CmdBeginSolidClip CmdBeginSolidClip_read(CmdBeginSolidClipRef ref) {
 
 void CmdBeginSolidClip_write(CmdBeginSolidClipRef ref, CmdBeginSolidClip s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.alpha);
+    memory[ix + 0] = floatBitsToUint(s.alpha);
 }
 
 CmdEndClip CmdEndClip_read(CmdEndClipRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
     CmdEndClip s;
     s.alpha = uintBitsToFloat(raw0);
     return s;
@@ -289,12 +289,12 @@ CmdEndClip CmdEndClip_read(CmdEndClipRef ref) {
 
 void CmdEndClip_write(CmdEndClipRef ref, CmdEndClip s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.alpha);
+    memory[ix + 0] = floatBitsToUint(s.alpha);
 }
 
 CmdSolid CmdSolid_read(CmdSolidRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
     CmdSolid s;
     s.rgba_color = raw0;
     return s;
@@ -302,12 +302,12 @@ CmdSolid CmdSolid_read(CmdSolidRef ref) {
 
 void CmdSolid_write(CmdSolidRef ref, CmdSolid s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.rgba_color;
+    memory[ix + 0] = s.rgba_color;
 }
 
 CmdSolidMask CmdSolidMask_read(CmdSolidMaskRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
     CmdSolidMask s;
     s.mask = uintBitsToFloat(raw0);
     return s;
@@ -315,12 +315,12 @@ CmdSolidMask CmdSolidMask_read(CmdSolidMaskRef ref) {
 
 void CmdSolidMask_write(CmdSolidMaskRef ref, CmdSolidMask s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = floatBitsToUint(s.mask);
+    memory[ix + 0] = floatBitsToUint(s.mask);
 }
 
 CmdJump CmdJump_read(CmdJumpRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = ptcl[ix + 0];
+    uint raw0 = memory[ix + 0];
     CmdJump s;
     s.new_ref = raw0;
     return s;
@@ -328,11 +328,11 @@ CmdJump CmdJump_read(CmdJumpRef ref) {
 
 void CmdJump_write(CmdJumpRef ref, CmdJump s) {
     uint ix = ref.offset >> 2;
-    ptcl[ix + 0] = s.new_ref;
+    memory[ix + 0] = s.new_ref;
 }
 
 uint Cmd_tag(CmdRef ref) {
-    return ptcl[ref.offset >> 2];
+    return memory[ref.offset >> 2];
 }
 
 CmdCircle Cmd_Circle_read(CmdRef ref) {
@@ -376,56 +376,56 @@ CmdJump Cmd_Jump_read(CmdRef ref) {
 }
 
 void Cmd_End_write(CmdRef ref) {
-    ptcl[ref.offset >> 2] = Cmd_End;
+    memory[ref.offset >> 2] = Cmd_End;
 }
 
 void Cmd_Circle_write(CmdRef ref, CmdCircle s) {
-    ptcl[ref.offset >> 2] = Cmd_Circle;
+    memory[ref.offset >> 2] = Cmd_Circle;
     CmdCircle_write(CmdCircleRef(ref.offset + 4), s);
 }
 
 void Cmd_Line_write(CmdRef ref, CmdLine s) {
-    ptcl[ref.offset >> 2] = Cmd_Line;
+    memory[ref.offset >> 2] = Cmd_Line;
     CmdLine_write(CmdLineRef(ref.offset + 4), s);
 }
 
 void Cmd_Fill_write(CmdRef ref, CmdFill s) {
-    ptcl[ref.offset >> 2] = Cmd_Fill;
+    memory[ref.offset >> 2] = Cmd_Fill;
     CmdFill_write(CmdFillRef(ref.offset + 4), s);
 }
 
 void Cmd_BeginClip_write(CmdRef ref, CmdBeginClip s) {
-    ptcl[ref.offset >> 2] = Cmd_BeginClip;
+    memory[ref.offset >> 2] = Cmd_BeginClip;
     CmdBeginClip_write(CmdBeginClipRef(ref.offset + 4), s);
 }
 
 void Cmd_BeginSolidClip_write(CmdRef ref, CmdBeginSolidClip s) {
-    ptcl[ref.offset >> 2] = Cmd_BeginSolidClip;
+    memory[ref.offset >> 2] = Cmd_BeginSolidClip;
     CmdBeginSolidClip_write(CmdBeginSolidClipRef(ref.offset + 4), s);
 }
 
 void Cmd_EndClip_write(CmdRef ref, CmdEndClip s) {
-    ptcl[ref.offset >> 2] = Cmd_EndClip;
+    memory[ref.offset >> 2] = Cmd_EndClip;
     CmdEndClip_write(CmdEndClipRef(ref.offset + 4), s);
 }
 
 void Cmd_Stroke_write(CmdRef ref, CmdStroke s) {
-    ptcl[ref.offset >> 2] = Cmd_Stroke;
+    memory[ref.offset >> 2] = Cmd_Stroke;
     CmdStroke_write(CmdStrokeRef(ref.offset + 4), s);
 }
 
 void Cmd_Solid_write(CmdRef ref, CmdSolid s) {
-    ptcl[ref.offset >> 2] = Cmd_Solid;
+    memory[ref.offset >> 2] = Cmd_Solid;
     CmdSolid_write(CmdSolidRef(ref.offset + 4), s);
 }
 
 void Cmd_SolidMask_write(CmdRef ref, CmdSolidMask s) {
-    ptcl[ref.offset >> 2] = Cmd_SolidMask;
+    memory[ref.offset >> 2] = Cmd_SolidMask;
     CmdSolidMask_write(CmdSolidMaskRef(ref.offset + 4), s);
 }
 
 void Cmd_Jump_write(CmdRef ref, CmdJump s) {
-    ptcl[ref.offset >> 2] = Cmd_Jump;
+    memory[ref.offset >> 2] = Cmd_Jump;
     CmdJump_write(CmdJumpRef(ref.offset + 4), s);
 }
 
diff --git a/piet-gpu/shader/setup.h b/piet-gpu/shader/setup.h
index 6998a16f..9a7d580d 100644
--- a/piet-gpu/shader/setup.h
+++ b/piet-gpu/shader/setup.h
@@ -28,3 +28,13 @@
 #define N_TILE (N_TILE_X * N_TILE_Y)
 #define LG_N_TILE (7 + LG_WG_FACTOR)
 #define N_SLICE (N_TILE / 32)
+
+struct Config {
+    uint n_elements; // paths; bounds element_ix in tile_alloc
+    uint n_pathseg; // number of path segments; bounds element_ix in path_coarse
+    uint tile_base; // byte offset in memory of the Path array
+    uint bin_base; // presumably byte offset of the bin data -- confirm against the binning shader
+    uint ptcl_base; // presumably byte offset of the per-tile command lists -- confirm
+    uint pathseg_base; // byte offset in memory of the PathSeg array
+    uint anno_base; // byte offset in memory of the Annotated array
+};
diff --git a/piet-gpu/shader/tile.h b/piet-gpu/shader/tile.h
index a33cb5ab..133ff53e 100644
--- a/piet-gpu/shader/tile.h
+++ b/piet-gpu/shader/tile.h
@@ -51,9 +51,9 @@ TileSegRef TileSeg_index(TileSegRef ref, uint index) {
 
 Path Path_read(PathRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = tile[ix + 0];
-    uint raw1 = tile[ix + 1];
-    uint raw2 = tile[ix + 2];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
     Path s;
     s.bbox = uvec4(raw0 & 0xffff, raw0 >> 16, raw1 & 0xffff, raw1 >> 16);
     s.tiles = TileRef(raw2);
@@ -62,15 +62,15 @@ Path Path_read(PathRef ref) {
 
 void Path_write(PathRef ref, Path s) {
     uint ix = ref.offset >> 2;
-    tile[ix + 0] = s.bbox.x | (s.bbox.y << 16);
-    tile[ix + 1] = s.bbox.z | (s.bbox.w << 16);
-    tile[ix + 2] = s.tiles.offset;
+    memory[ix + 0] = s.bbox.x | (s.bbox.y << 16);
+    memory[ix + 1] = s.bbox.z | (s.bbox.w << 16);
+    memory[ix + 2] = s.tiles.offset;
 }
 
 Tile Tile_read(TileRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = tile[ix + 0];
-    uint raw1 = tile[ix + 1];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
     Tile s;
     s.tile = TileSegRef(raw0);
     s.backdrop = int(raw1);
@@ -79,18 +79,18 @@ Tile Tile_read(TileRef ref) {
 
 void Tile_write(TileRef ref, Tile s) {
     uint ix = ref.offset >> 2;
-    tile[ix + 0] = s.tile.offset;
-    tile[ix + 1] = uint(s.backdrop);
+    memory[ix + 0] = s.tile.offset;
+    memory[ix + 1] = uint(s.backdrop);
 }
 
 TileSeg TileSeg_read(TileSegRef ref) {
     uint ix = ref.offset >> 2;
-    uint raw0 = tile[ix + 0];
-    uint raw1 = tile[ix + 1];
-    uint raw2 = tile[ix + 2];
-    uint raw3 = tile[ix + 3];
-    uint raw4 = tile[ix + 4];
-    uint raw5 = tile[ix + 5];
+    uint raw0 = memory[ix + 0];
+    uint raw1 = memory[ix + 1];
+    uint raw2 = memory[ix + 2];
+    uint raw3 = memory[ix + 3];
+    uint raw4 = memory[ix + 4];
+    uint raw5 = memory[ix + 5];
     TileSeg s;
     s.origin = vec2(uintBitsToFloat(raw0), uintBitsToFloat(raw1));
     s.vector = vec2(uintBitsToFloat(raw2), uintBitsToFloat(raw3));
@@ -101,11 +101,11 @@ TileSeg TileSeg_read(TileSegRef ref) {
 
 void TileSeg_write(TileSegRef ref, TileSeg s) {
     uint ix = ref.offset >> 2;
-    tile[ix + 0] = floatBitsToUint(s.origin.x);
-    tile[ix + 1] = floatBitsToUint(s.origin.y);
-    tile[ix + 2] = floatBitsToUint(s.vector.x);
-    tile[ix + 3] = floatBitsToUint(s.vector.y);
-    tile[ix + 4] = floatBitsToUint(s.y_edge);
-    tile[ix + 5] = s.next.offset;
+    memory[ix + 0] = floatBitsToUint(s.origin.x);
+    memory[ix + 1] = floatBitsToUint(s.origin.y);
+    memory[ix + 2] = floatBitsToUint(s.vector.x);
+    memory[ix + 3] = floatBitsToUint(s.vector.y);
+    memory[ix + 4] = floatBitsToUint(s.y_edge);
+    memory[ix + 5] = s.next.offset;
 }
 
diff --git a/piet-gpu/shader/tile_alloc.comp b/piet-gpu/shader/tile_alloc.comp
index 64529d1c..3280f7ff 100644
--- a/piet-gpu/shader/tile_alloc.comp
+++ b/piet-gpu/shader/tile_alloc.comp
@@ -6,24 +6,15 @@
 #extension GL_GOOGLE_include_directive : enable
 
 #include "setup.h"
+#include "mem.h"
 
 #define LG_TILE_ALLOC_WG (7 + LG_WG_FACTOR)
 #define TILE_ALLOC_WG (1 << LG_TILE_ALLOC_WG)
 
 layout(local_size_x = TILE_ALLOC_WG, local_size_y = 1) in;
 
-layout(set = 0, binding = 0) buffer AnnotatedBuf {
-    uint[] annotated;
-};
-
-layout(set = 0, binding = 1) buffer AllocBuf {
-    uint n_elements;
-    uint n_pathseg;
-    uint alloc;
-};
-
-layout(set = 0, binding = 2) buffer TileBuf {
-    uint[] tile;
+layout(set = 0, binding = 1) readonly buffer ConfigBuf {
+    Config conf;
 };
 
 #include "annotated.h"
@@ -34,16 +25,20 @@ layout(set = 0, binding = 2) buffer TileBuf {
 #define SY (1.0 / float(TILE_HEIGHT_PX))
 
 shared uint sh_tile_count[TILE_ALLOC_WG];
-shared uint sh_tile_alloc;
+shared Alloc sh_tile_alloc;
 
 void main() {
+    if (mem_overflow) {
+        return;
+    }
+
     uint th_ix = gl_LocalInvocationID.x;
     uint element_ix = gl_GlobalInvocationID.x;
-    PathRef path_ref = PathRef(element_ix * Path_size);
-    AnnotatedRef ref = AnnotatedRef(element_ix * Annotated_size);
+    PathRef path_ref = PathRef(conf.tile_base + element_ix * Path_size);
+    AnnotatedRef ref = AnnotatedRef(conf.anno_base + element_ix * Annotated_size);
 
     uint tag = Annotated_Nop;
-    if (element_ix < n_elements) {
+    if (element_ix < conf.n_elements) {
         tag = Annotated_tag(ref);
     }
     int x0 = 0, y0 = 0, x1 = 0, y1 = 0;
@@ -86,23 +81,26 @@ void main() {
         sh_tile_count[th_ix] = tile_count;
     }
     if (th_ix == TILE_ALLOC_WG - 1) {
-        sh_tile_alloc = atomicAdd(alloc, tile_count * Tile_size);
+        sh_tile_alloc = malloc(tile_count * Tile_size);
     }
     barrier();
-    uint alloc_start = sh_tile_alloc;
+    Alloc alloc_start = sh_tile_alloc;
+    if (alloc_start.failed) {
+        return;
+    }
 
-    if (element_ix < n_elements) {
+    if (element_ix < conf.n_elements) {
         uint tile_subix = th_ix > 0 ? sh_tile_count[th_ix - 1] : 0;
-        path.tiles = TileRef(alloc_start + Tile_size * tile_subix);
+        path.tiles = TileRef(alloc_start.offset + Tile_size * tile_subix);
         Path_write(path_ref, path);
     }
 
     // Zero out allocated tiles efficiently
     uint total_count = sh_tile_count[TILE_ALLOC_WG - 1] * (Tile_size / 4);
-    uint start_ix = alloc_start >> 2;
+    uint start_ix = alloc_start.offset >> 2;
     for (uint i = th_ix; i < total_count; i += TILE_ALLOC_WG) {
         // Note: this interleaving is faster than using Tile_write
         // by a significant amount.
-        tile[start_ix + i] = 0;
+        memory[start_ix + i] = 0;
     }
 }
diff --git a/piet-gpu/shader/tile_alloc.spv b/piet-gpu/shader/tile_alloc.spv
index e901bad1..e407222c 100644
Binary files a/piet-gpu/shader/tile_alloc.spv and b/piet-gpu/shader/tile_alloc.spv differ
diff --git a/piet-gpu/src/lib.rs b/piet-gpu/src/lib.rs
index 18688886..02726648 100644
--- a/piet-gpu/src/lib.rs
+++ b/piet-gpu/src/lib.rs
@@ -156,15 +156,16 @@ pub fn dump_k1_data(k1_buf: &[u32]) {
 pub struct Renderer {
     pub image_dev: hub::Image, // resulting image
 
-    scene_buf: hub::Buffer,
-    scene_dev: hub::Buffer,
+    scene_buf_host: hub::Buffer,
+    scene_buf_dev: hub::Buffer,
 
-    pub state_buf: hub::Buffer,
-    pub anno_buf: hub::Buffer,
-    pub pathseg_buf: hub::Buffer,
-    pub tile_buf: hub::Buffer,
-    pub bin_buf: hub::Buffer,
-    pub ptcl_buf: hub::Buffer,
+    memory_buf_host: hub::Buffer,
+    memory_buf_dev: hub::Buffer,
+
+    state_buf: hub::Buffer,
+
+    config_buf_host: hub::Buffer,
+    config_buf_dev: hub::Buffer,
 
     el_pipeline: hub::Pipeline,
     el_ds: hub::DescriptorSet,
@@ -178,23 +179,12 @@ pub struct Renderer {
     backdrop_pipeline: hub::Pipeline,
     backdrop_ds: hub::DescriptorSet,
 
-    tile_alloc_buf_host: hub::Buffer,
-    tile_alloc_buf_dev: hub::Buffer,
-
     bin_pipeline: hub::Pipeline,
     bin_ds: hub::DescriptorSet,
 
-    bin_alloc_buf_host: hub::Buffer,
-    bin_alloc_buf_dev: hub::Buffer,
-
     coarse_pipeline: hub::Pipeline,
     coarse_ds: hub::DescriptorSet,
 
-    coarse_alloc_buf_host: hub::Buffer,
-    coarse_alloc_buf_dev: hub::Buffer,
-
-    clip_scratch_buf: hub::Buffer,
-
     k4_pipeline: hub::Pipeline,
     k4_ds: hub::DescriptorSet,
 
@@ -221,88 +211,83 @@ impl Renderer {
             n_elements, n_paths, n_pathseg
         );
 
-        let mut scene_buf = session
+        let mut scene_buf_host = session
             .create_buffer(std::mem::size_of_val(&scene[..]) as u64, host)
             .unwrap();
-        let scene_dev = session
+        let scene_buf_dev = session
             .create_buffer(std::mem::size_of_val(&scene[..]) as u64, dev)
             .unwrap();
-        scene_buf.write(&scene)?;
+        scene_buf_host.write(&scene)?;
 
         let state_buf = session.create_buffer(1 * 1024 * 1024, dev)?;
-        let anno_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
-        let pathseg_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
-        let tile_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
-        let bin_buf = session.create_buffer(64 * 1024 * 1024, dev)?;
-        let ptcl_buf = session.create_buffer(48 * 1024 * 1024, dev)?;
         let image_dev = session.create_image2d(WIDTH as u32, HEIGHT as u32, dev)?;
 
+        let mut config_buf_host = session.create_buffer(7*4, host)?;
+        let config_buf_dev = session.create_buffer(7*4, dev)?;
+
+        // TODO: constants
+        const PATH_SIZE: usize = 12;
+        const BIN_SIZE: usize = 8;
+        const PATHSEG_SIZE: usize = 48;
+        const ANNO_SIZE: usize = 28;
+        let mut alloc = 0;
+        let tile_base = alloc;
+        alloc += ((n_paths + 3) & !3) * PATH_SIZE;
+        let bin_base = alloc;
+        alloc += ((n_paths + 255) & !255) * BIN_SIZE;
+        let ptcl_base = alloc;
+        alloc += WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
+        let pathseg_base = alloc;
+        alloc += (n_pathseg * PATHSEG_SIZE + 3) & !3;
+        let anno_base = alloc;
+        alloc += (n_paths * ANNO_SIZE + 3) & !3;
+        config_buf_host.write(&[n_paths as u32, n_pathseg as u32, tile_base as u32, bin_base as u32, ptcl_base as u32, pathseg_base as u32, anno_base as u32])?;
+
+        let mut memory_buf_host = session.create_buffer(2*4, host)?;
+        let memory_buf_dev = session.create_buffer(128 * 1024 * 1024, dev)?;
+        memory_buf_host.write(&[alloc as u32, 0 /* Overflow flag */])?;
+
         let el_code = include_bytes!("../shader/elements.spv");
         let el_pipeline = session.create_simple_compute_pipeline(el_code, 4)?;
         let el_ds = session.create_simple_descriptor_set(
             &el_pipeline,
-            &[&scene_dev, &state_buf, &anno_buf, &pathseg_buf],
+            &[&memory_buf_dev, &config_buf_dev, &scene_buf_dev, &state_buf],
         )?;
 
-        let mut tile_alloc_buf_host = session.create_buffer(12, host)?;
-        let tile_alloc_buf_dev = session.create_buffer(12, dev)?;
-
-        // TODO: constants
-        const PATH_SIZE: usize = 12;
-        let tile_alloc_start = ((n_paths + 31) & !31) * PATH_SIZE;
-        tile_alloc_buf_host.write(&[n_paths as u32, n_pathseg as u32, tile_alloc_start as u32])?;
         let tile_alloc_code = include_bytes!("../shader/tile_alloc.spv");
-        let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 3)?;
+        let tile_pipeline = session.create_simple_compute_pipeline(tile_alloc_code, 2)?;
         let tile_ds = session.create_simple_descriptor_set(
             &tile_pipeline,
-            &[&anno_buf, &tile_alloc_buf_dev, &tile_buf],
+            &[&memory_buf_dev, &config_buf_dev],
         )?;
 
         let path_alloc_code = include_bytes!("../shader/path_coarse.spv");
-        let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 3)?;
+        let path_pipeline = session.create_simple_compute_pipeline(path_alloc_code, 2)?;
         let path_ds = session.create_simple_descriptor_set(
             &path_pipeline,
-            &[&pathseg_buf, &tile_alloc_buf_dev, &tile_buf],
+            &[&memory_buf_dev, &config_buf_dev],
         )?;
 
         let backdrop_alloc_code = include_bytes!("../shader/backdrop.spv");
-        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 3)?;
+        let backdrop_pipeline = session.create_simple_compute_pipeline(backdrop_alloc_code, 2)?;
         let backdrop_ds = session.create_simple_descriptor_set(
             &backdrop_pipeline,
-            &[&anno_buf, &tile_alloc_buf_dev, &tile_buf],
+            &[&memory_buf_dev, &config_buf_dev],
         )?;
 
-        let mut bin_alloc_buf_host = session.create_buffer(8, host)?;
-        let bin_alloc_buf_dev = session.create_buffer(8, dev)?;
-
         // TODO: constants
-        let bin_alloc_start = ((n_paths + 255) & !255) * 8;
-        bin_alloc_buf_host.write(&[n_paths as u32, bin_alloc_start as u32])?;
         let bin_code = include_bytes!("../shader/binning.spv");
-        let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 3)?;
+        let bin_pipeline = session.create_simple_compute_pipeline(bin_code, 2)?;
         let bin_ds = session.create_simple_descriptor_set(
             &bin_pipeline,
-            &[&anno_buf, &bin_alloc_buf_dev, &bin_buf],
+            &[&memory_buf_dev, &config_buf_dev],
         )?;
 
-        let clip_scratch_buf = session.create_buffer(1024 * 1024, dev)?;
-
-        let mut coarse_alloc_buf_host = session.create_buffer(8, host)?;
-        let coarse_alloc_buf_dev = session.create_buffer(8, dev)?;
-
-        let coarse_alloc_start = WIDTH_IN_TILES * HEIGHT_IN_TILES * PTCL_INITIAL_ALLOC;
-        coarse_alloc_buf_host.write(&[n_paths as u32, coarse_alloc_start as u32])?;
         let coarse_code = include_bytes!("../shader/coarse.spv");
-        let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 5)?;
+        let coarse_pipeline = session.create_simple_compute_pipeline(coarse_code, 2)?;
         let coarse_ds = session.create_simple_descriptor_set(
             &coarse_pipeline,
-            &[
-                &anno_buf,
-                &bin_buf,
-                &tile_buf,
-                &coarse_alloc_buf_dev,
-                &ptcl_buf,
-            ],
+            &[&memory_buf_dev, &config_buf_dev],
         )?;
 
         let bg_image = Self::make_test_bg_image(&session);
@@ -318,20 +303,25 @@ impl Renderer {
         let sampler = session.create_sampler(SamplerParams::Linear)?;
         let k4_pipeline = session
             .pipeline_builder()
-            .add_buffers(3)
+            .add_buffers(2)
             .add_images(1)
             .add_textures(max_textures)
             .create_compute_pipeline(&session, k4_code)?;
         let k4_ds = session
             .descriptor_set_builder()
-            .add_buffers(&[&ptcl_buf, &tile_buf, &clip_scratch_buf])
+            .add_buffers(&[&memory_buf_dev, &config_buf_dev])
             .add_images(&[&image_dev])
             .add_textures(&[&bg_image], &sampler)
             .build(&session, &k4_pipeline)?;
 
         Ok(Renderer {
-            scene_buf,
-            scene_dev,
+            scene_buf_host,
+            scene_buf_dev,
+            memory_buf_host,
+            memory_buf_dev,
+            state_buf,
+            config_buf_host,
+            config_buf_dev,
             image_dev,
             el_pipeline,
             el_ds,
@@ -347,19 +337,6 @@ impl Renderer {
             coarse_ds,
             k4_pipeline,
             k4_ds,
-            state_buf,
-            anno_buf,
-            pathseg_buf,
-            tile_buf,
-            bin_buf,
-            ptcl_buf,
-            tile_alloc_buf_host,
-            tile_alloc_buf_dev,
-            bin_alloc_buf_host,
-            bin_alloc_buf_dev,
-            coarse_alloc_buf_host,
-            coarse_alloc_buf_dev,
-            clip_scratch_buf,
             n_elements,
             n_paths,
             n_pathseg,
@@ -368,21 +345,16 @@ impl Renderer {
     }
 
     pub unsafe fn record(&self, cmd_buf: &mut hub::CmdBuf, query_pool: &hub::QueryPool) {
-        cmd_buf.copy_buffer(self.scene_buf.vk_buffer(), self.scene_dev.vk_buffer());
-        cmd_buf.copy_buffer(
-            self.tile_alloc_buf_host.vk_buffer(),
-            self.tile_alloc_buf_dev.vk_buffer(),
-        );
+        cmd_buf.copy_buffer(self.scene_buf_host.vk_buffer(), self.scene_buf_dev.vk_buffer());
         cmd_buf.copy_buffer(
-            self.bin_alloc_buf_host.vk_buffer(),
-            self.bin_alloc_buf_dev.vk_buffer(),
+            self.config_buf_host.vk_buffer(),
+            self.config_buf_dev.vk_buffer(),
         );
         cmd_buf.copy_buffer(
-            self.coarse_alloc_buf_host.vk_buffer(),
-            self.coarse_alloc_buf_dev.vk_buffer(),
+            self.memory_buf_host.vk_buffer(),
+            self.memory_buf_dev.vk_buffer(),
         );
         cmd_buf.clear_buffer(self.state_buf.vk_buffer(), None);
-        cmd_buf.clear_buffer(self.clip_scratch_buf.vk_buffer(), Some(4));
         cmd_buf.memory_barrier();
         cmd_buf.image_barrier(
             self.image_dev.vk_image(),