diff --git a/.gitmodules b/.gitmodules
index c865dca..0ae42d4 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,7 +7,7 @@
 [submodule "ext/bulk"]
 	path = ext/bulk
 	url = https://github.com/jwbuurlage/Bulk.git
-        branch = array_queue
+        branch = develop
 [submodule "catch"]
 	path = ext/catch
 	url = https://github.com/philsquared/Catch
diff --git a/distributed/overlaps.hpp b/distributed/overlaps.hpp
index 034394b..859bf92 100644
--- a/distributed/overlaps.hpp
+++ b/distributed/overlaps.hpp
@@ -15,6 +15,21 @@ struct overlap {
     distributed::shadow region;
 };
 
+struct pod_shadow {
+    std::array<int, 2> min_pt;
+    std::array<int, 2> max_pt;
+};
+
+auto to_pod(distributed::shadow sh) {
+    return pod_shadow{math::vec_to_array<2_D, int>(sh.min_pt),
+                      math::vec_to_array<2_D, int>(sh.max_pt)};
+}
+
+auto from_pod(pod_shadow sh) {
+    return distributed::shadow{math::array_to_vec<2_D, int>(sh.min_pt),
+                               math::array_to_vec<2_D, int>(sh.max_pt)};
+}
+
 void communicate_overlaps(bulk::world& world,
                           projections<3_D, T>& local_proj_stack,
                           distributed::restricted_geometry<T>& local_geometry,
@@ -22,7 +37,7 @@ void communicate_overlaps(bulk::world& world,
     auto buffer = std::vector<T>();
     auto& proj_data = local_proj_stack.mutable_data();
 
-    auto q = bulk::queue<T[], int, distributed::shadow>(world);
+    auto q = bulk::queue<T[], int, pod_shadow>(world);
     for (auto ov : overlaps) {
         auto offset = local_proj_stack.offset(ov.projection);
         auto local_region = local_geometry.local_shadow(ov.projection);
@@ -40,12 +55,13 @@ void communicate_overlaps(bulk::world& world,
             }
         }
 
-        q(ov.target).send_many(buffer, ov.projection, ov.region);
+        q(ov.target).send(buffer, ov.projection, to_pod(ov.region));
     }
 
     world.sync();
 
-    for (auto && [ xs, proj, sh ] : q) {
+    for (auto && [ xs, proj, shpod ] : q) {
+        auto sh = from_pod(shpod);
         auto local_region = local_geometry.local_shadow(proj);
         auto w = distributed::shadow{sh.min_pt - local_region.min_pt,
                                      sh.max_pt - local_region.min_pt};
@@ -66,14 +82,9 @@ std::vector<overlap>
 compute_overlaps(bulk::world& world,
                  const distributed::restricted_geometry<T>& geometry) {
 
-    struct pod_shadow {
-        std::array<int, 2> min_pt;
-        std::array<int, 2> max_pt;
-    };
-
     auto overlaps = std::vector<overlap>();
 
-    auto s = world.processor_id();
+    auto s = world.rank();
     auto p = world.active_processors();
 
     // STEP 1: communicate shadows for all projections
@@ -82,8 +93,8 @@ compute_overlaps(bulk::world& world,
         bulk::coarray<pod_shadow>(world, p * geometry.projection_count());
 
     for (int i = 0; i < geometry.projection_count(); ++i) {
+        auto sh = geometry.local_shadow(i);
         for (int t = 0; t < p; ++t) {
-            auto sh = geometry.local_shadow(i);
             shadows(t)[p * i + s] = {math::vec_to_array<2_D, int>(sh.min_pt),
                                      math::vec_to_array<2_D, int>(sh.max_pt)};
         }
diff --git a/distributed/simple_distributed.cpp b/distributed/simple_distributed.cpp
index 91fc284..9190bff 100644
--- a/distributed/simple_distributed.cpp
+++ b/distributed/simple_distributed.cpp
@@ -1,3 +1,10 @@
+/*
+ * FIXME:
+ * - Can we group sends together? this should greatly reduce data size
+ *   with a constant factor.
+ */
+
+#include <cassert>
 #include <sstream>
 
 #include "bulk/backends/mpi/mpi.hpp"
@@ -40,7 +47,6 @@ auto calculate_local_volume(bulk::rectangular_partitioning<3, G>& partitioning,
 
 struct shared_pixel {
     int target;
-    int projection;
     int local_line;
     int remote_line;
 };
@@ -48,36 +54,126 @@ struct shared_pixel {
 std::pair<std::vector<shared_pixel>, std::vector<shared_pixel>>
 compute_contributions(bulk::world& world,
                       const distributed::restricted_geometry<T>& geometry) {
-    world.sync();
-    (void)geometry;
+    auto s = world.rank();
+    auto p = world.active_processors();
+    auto& gg = geometry.global_geometry();
+
+    // block_size is number of local projections
+    auto block_size = ((geometry.projection_count() - 1) / p) + 1;
+    auto shadows = bulk::coarray<pod_shadow>(world, p * block_size);
+    auto offsets = bulk::coarray<int>(world, p * block_size);
+
     // 1. send all `p` windows for all projections `i` to processor `i % p`
+    for (int i = 0; i < geometry.projection_count(); ++i) {
+        auto sh = geometry.local_shadow(i);
+        shadows(i % p)[(i / p) * p + s] = to_pod(sh);
+        offsets(i % p)[(i / p) * p + s] = geometry.offset(i);
+    }
+    world.sync();
+
     // 2. for each projection, for each pixel store the contributors
+    auto global_shape = gg.projection_shape(0);
+    auto pixel_count = math::product<2_D, int>(global_shape);
+    auto pixels = std::vector<std::vector<int>>(pixel_count * block_size);
+    for (int b = 0; b < block_size; ++b) {
+        for (int t = 0; t < p; ++t) {
+            auto w = shadows[b * p + t];
+            for (int j = w.min_pt[1]; j <= w.max_pt[1]; ++j) {
+                for (int i = w.min_pt[0]; i <= w.max_pt[0]; ++i) {
+                    if (j * global_shape[0] + i >= pixel_count) {
+                        std::cout << b << " [" << w.min_pt[0] << ", "
+                                  << w.max_pt[0] << "] -> " << i << " " << j
+                                  << " / " << pixel_count << " = "
+                                  << global_shape[0] << " x " << global_shape[1]
+                                  << "\n";
+                        assert(false);
+                    }
+                    pixels[b * pixel_count + j * global_shape[0] + i].push_back(
+                        t);
+                }
+            }
+        }
+    }
+
     // 3. decide who will be responsible for the pixel
     //    maybe (line_number % #contributors)
-    // 4. send to the owner the contributors, to the contributors the owner
+    auto local_idx = [](auto offset, auto lw, auto gw, auto idx) {
+        auto x = idx % gw[0] - lw.min_pt[0];
+        auto y = idx / gw[0] - lw.min_pt[1];
+        auto idx_in_proj = y * (lw.max_pt[0] - lw.min_pt[0] + 1) + x;
+        return offset + idx_in_proj;
+    };
+
+    auto my_contributions = bulk::queue<shared_pixel>(world);
+    auto my_responsibilities = bulk::queue<shared_pixel>(world);
+
+    for (int b = 0; b < block_size; ++b) {
+        for (int i = 0; i < pixel_count; ++i) {
+            auto pixel_idx = b * pixel_count + i;
+            if (pixels[pixel_idx].size() == 0) {
+                continue;
+            }
+            auto owner =
+                pixels[pixel_idx][pixel_idx % pixels[pixel_idx].size()];
+            // for contributors, we can compute the local index if we know the
+            // offset.. lets just send them... aaaaand got 'em..
+
+            auto owner_line =
+                local_idx(offsets[b * p + owner], shadows[b * p + owner],
+                          gg.projection_shape(0), i);
+
+            for (auto j = 0u; j < pixels[pixel_idx].size(); ++j) {
+                auto t = pixels[pixel_idx][j];
+                if (t == owner) {
+                    continue;
+                }
+
+                // 4. send to the owner the contributors, to the contributors
+                // the owner
+                auto contributor_line =
+                    local_idx(offsets[b * p + t], shadows[b * p + t],
+                              gg.projection_shape(0), i);
+                my_contributions(t).send({owner, contributor_line, owner_line});
+                my_responsibilities(owner).send(
+                    {t, owner_line, contributor_line});
+            }
+        }
+    }
+
+    world.sync();
+
     // 5. store contributors and owners in a vector
-    return {{}, {}};
+    auto contributions = std::vector<shared_pixel>(my_contributions.size());
+    auto responsibilities =
+        std::vector<shared_pixel>(my_responsibilities.size());
+    std::copy(my_contributions.begin(), my_contributions.end(),
+              contributions.begin());
+    std::copy(my_responsibilities.begin(), my_responsibilities.end(),
+              responsibilities.begin());
+
+    return {std::move(contributions), std::move(responsibilities)};
 }
 
 void communicate_contributions(bulk::world& world, projections<3_D, T>& projs,
                                const std::vector<shared_pixel>& contributions,
                                const std::vector<shared_pixel>& results) {
-    auto q = bulk::queue<int, int, T>(world);
+    auto q = bulk::queue<int, T>(world);
     auto share = [&](const auto& xs) {
-        for (auto[target, projection, local, remote] : xs) {
-            q(target).send(projection, remote,
-                           projs[projs.offset(projection) + local]);
+        for (auto [target, local, remote] : xs) {
+            q(target).send(remote, projs[local]);
         }
-        world.sync();
     };
 
     share(contributions);
-    for (auto[projection, idx, value] : q) {
-        projs[projs.offset(projection) + idx] += value;
+    world.sync();
+    for (auto [idx, value] : q) {
+        projs[idx] += value;
     }
+
     share(results);
-    for (auto[projection, idx, value] : q) {
-        projs[projs.offset(projection) + idx] = value;
+    world.sync();
+    for (auto [idx, value] : q) {
+        projs[idx] = value;
     }
 }
 
@@ -99,8 +195,8 @@ void run(util::args options) {
     //        {k, k, k}, {2, 1, 4});
 
     env.spawn(processors, [&](auto& world) {
-        auto s = world.processor_id();
-        auto vs = calculate_local_volume(partitioning, world.processor_id());
+        auto s = world.rank();
+        auto vs = calculate_local_volume(partitioning, world.rank());
 
         // Make scene with s = 0, connect to this scene from other procs
         util::ext_plotter<3_D, T> plotter("tcp://localhost:5555");
@@ -150,8 +246,7 @@ void run(util::args options) {
             }
         };
 
-        auto overlaps = compute_overlaps<T>(world, gs);
-        communicate_overlaps(world, p, gs, overlaps);
+        auto [go_forth, and_back] = compute_contributions(world, gs);
 
         // communicate row and column sums
         auto invert = [](auto x) { return (T)1.0 / x; };
@@ -160,7 +255,7 @@ void run(util::args options) {
         };
         auto r = projections<3_D, T>(gs);
         fp(image<3_D, T>(vs, (T)1.0), gs, vs, r);
-        communicate_overlaps(world, r, gs, overlaps);
+        communicate_contributions(world, r, go_forth, and_back);
         invert_all(r.mutable_data());
         auto c = image<3_D, T>(vs);
         bp(projections<3_D, T>(gs, (T)1.0), gs, vs, c);
@@ -175,7 +270,7 @@ void run(util::args options) {
             for (int l = 0; l < gs.lines(); ++l) {
                 q[l] = r[l] * (p[l] - q[l]);
             }
-            communicate_overlaps(world, q, gs, overlaps);
+            communicate_contributions(world, q, go_forth, and_back);
             bp(q, gs, vs, z);
             for (int i = 0; i < vs.cells(); ++i) {
                 x[i] += c[i] * z[i];
diff --git a/ext/bulk b/ext/bulk
index b774363..5c5ab58 160000
--- a/ext/bulk
+++ b/ext/bulk
@@ -1 +1 @@
-Subproject commit b774363e4894dacb810ba1e04e7e67d405841d45
+Subproject commit 5c5ab584e513147af994896cb5f9a5ad1e9c27ba
diff --git a/ext/catch b/ext/catch
index 8ebe94c..d5613fb 160000
--- a/ext/catch
+++ b/ext/catch
@@ -1 +1 @@
-Subproject commit 8ebe94ca2ea1e6ae7765f65d2f3aec08c9a85ae2
+Subproject commit d5613fb18a6a2bc54ac5a4ce50f3e36ea2560166
diff --git a/ext/cpptoml b/ext/cpptoml
index 918f7f8..789b388 160000
--- a/ext/cpptoml
+++ b/ext/cpptoml
@@ -1 +1 @@
-Subproject commit 918f7f835bbff2602ef9a93a1085ed9930ac6330
+Subproject commit 789b3881992c6775b902aa096482850a683aa9d5
diff --git a/ext/cppzmq b/ext/cppzmq
index b0e6d4b..d9f0f01 160000
--- a/ext/cppzmq
+++ b/ext/cppzmq
@@ -1 +1 @@
-Subproject commit b0e6d4bacd176aec2c41b54d46b70a4682bb88d7
+Subproject commit d9f0f016c07046742738c65e1eb84722ae32d7d4
diff --git a/ext/fmt b/ext/fmt
index 3028344..6655e80 160000
--- a/ext/fmt
+++ b/ext/fmt
@@ -1 +1 @@
-Subproject commit 30283443805745af3761be1682e6849766246e76
+Subproject commit 6655e804c49f66b6c978fcdd01ef6b03b55208fa
diff --git a/ext/libzmq b/ext/libzmq
index 30ab0ed..834b9e4 160000
--- a/ext/libzmq
+++ b/ext/libzmq
@@ -1 +1 @@
-Subproject commit 30ab0ed897ae1d8ed82da89543f8bf5856a902e7
+Subproject commit 834b9e4cd7ff05918c336df8b2a858a5efe4d7ec
diff --git a/include/tomos/distributed/recursive_bisectioning.hpp b/include/tomos/distributed/recursive_bisectioning.hpp
index c944359..ceae29b 100644
--- a/include/tomos/distributed/recursive_bisectioning.hpp
+++ b/include/tomos/distributed/recursive_bisectioning.hpp
@@ -55,12 +55,13 @@ split_t find_split(const std::vector<math::line<D, T>>& lines,
     std::size_t idx = 0;
     for (auto line : lines) {
         auto intersections = math::intersect_bounds<D, T>(line, bounds);
-        // FIXME: check if intersects, since line may start inside bounds
         if (intersections) {
             auto a = intersections.value().first;
             auto b = intersections.value().second;
             crossings.push_back({a, idx, math::sign<D, T>(b - a)});
             crossings.push_back({b, idx, math::sign<D, T>(a - b)});
+        } else {
+            std::cout << "Line that should intersect does indeed not\n";
         }
         ++idx;
     }
@@ -190,11 +191,8 @@ partition_bisection(const tomo::geometry::base<D, T>& geometry,
     assert(1 << depth == processors);
 
     // containers for the current left and right lines
-    // TODO: needed here?
     std::vector<math::line<D, T>> all_lines;
     for (auto l : geometry) {
-        // FIXME actually; we need intersections here, not the truncation
-        // because it does a 'discrete intersection'
         auto line = math::truncate_to_volume(l, object_volume);
         if (line) {
             all_lines.push_back(line.value());
diff --git a/include/tomos/geometry.hpp b/include/tomos/geometry.hpp
index dd0a9bb..663765c 100644
--- a/include/tomos/geometry.hpp
+++ b/include/tomos/geometry.hpp
@@ -190,8 +190,7 @@ class base {
 
     /** Construct the geometry with a given number of lines. */
     base(int projection_count, bool parallel = false)
-        : projection_count_(projection_count), parallel_(parallel) {
-    }
+        : projection_count_(projection_count), parallel_(parallel) {}
 
     virtual ~base() = default;
 
@@ -217,12 +216,27 @@ class base {
 
     virtual std::array<math::vec<D, T>, D - 1>
     projection_delta(int i) const = 0;
+    int offset(int idx) const { return offsets_[idx]; }
 
   protected:
+    std::vector<int> offsets_;
+
+    void compute_offsets_() {
+        offsets_.resize(projection_count());
+        offsets_[0] = 0;
+        std::iota(offsets_.begin() + 1, offsets_.end(), 0);
+        std::transform(
+            offsets_.begin() + 1, offsets_.end(), offsets_.begin() + 1,
+            [&](int i) { return math::reduce<D - 1>(projection_shape(i)); });
+        std::partial_sum(offsets_.begin() + 1, offsets_.end(),
+                         offsets_.begin() + 1);
+    }
+
     void compute_lines_() {
         for (auto i = 0; i < projection_count_; ++i) {
             line_count_ += math::reduce<D - 1>(this->projection_shape(i));
         }
+        this->compute_offsets_();
     }
 
     int projection_count_ = 0;
diff --git a/include/tomos/math.hpp b/include/tomos/math.hpp
index 70c9024..2894976 100644
--- a/include/tomos/math.hpp
+++ b/include/tomos/math.hpp
@@ -322,7 +322,8 @@ intersect_bounds(line<D, T> l, std::array<vec2<int>, D> bounds) {
     auto for_sure_far_enough = product<D, T>(bounds_sides);
 
     auto result = aabb_intersection<D, T>(
-        l.origin, for_sure_far_enough * l.delta, bounds_sides, bounds_origin);
+        l.origin - for_sure_far_enough * l.delta, for_sure_far_enough * l.delta,
+        bounds_sides, bounds_origin);
 
     if (result) {
         return std::pair<vec<D, T>, vec<D, T>>{result.value().first,
diff --git a/include/tomos/projections.hpp b/include/tomos/projections.hpp
index 2673b92..8fe3bc4 100644
--- a/include/tomos/projections.hpp
+++ b/include/tomos/projections.hpp
@@ -22,15 +22,11 @@ class projections {
 
     /** Construct default-initialized projections for a geometry. */
     projections(const geometry::base<D, T>& geometry)
-        : geometry_(geometry), data_(geometry.lines()) {
-        compute_offsets_();
-    }
+        : geometry_(geometry), data_(geometry.lines()) {}
 
     /** Construct projections for a geometry with constant value. */
     projections(const geometry::base<D, T>& geometry, T value)
-        : geometry_(geometry), data_(geometry.lines(), value) {
-        compute_offsets_();
-    }
+        : geometry_(geometry), data_(geometry.lines(), value) {}
 
     /**
      * Obtain a reference to the i-th measurement, corresponding to the i-th
@@ -51,7 +47,7 @@ class projections {
 
     ///** Get a projection as an image */
     image<D - 1, T> get_projection(int idx) const {
-        auto offset = offsets_[idx];
+        auto offset = this->offset(idx);
         auto shape = geometry_.projection_shape(idx);
         auto img = image<D - 1, T>(volume<D - 1, T>(shape));
         for (int i = 0; i < math::reduce<D - 1>(shape); ++i) {
@@ -62,30 +58,18 @@ class projections {
 
     /** Set a projection using an image */
     void set_projection(int idx, const image<D - 1, T>& img) {
-        auto offset = offsets_[idx];
+        auto offset = this->offset(idx);
         auto pixels = math::reduce<D - 1>(geometry_.projection_shape(idx));
         for (int i = 0; i < pixels; ++i) {
             data_[offset + i] = img[i];
         }
     }
 
-    int offset(int idx) const { return offsets_[idx]; }
+    int offset(int idx) const { return geometry_.offset(idx); }
 
-  private:
-    void compute_offsets_() {
-        offsets_.resize(geometry_.projection_count());
-        offsets_[0] = 0;
-        std::iota(offsets_.begin() + 1, offsets_.end(), 0);
-        std::transform(offsets_.begin() + 1, offsets_.end(),
-                       offsets_.begin() + 1, [&](int i) {
-                           return math::reduce<D - 1>(
-                               geometry_.projection_shape(i));
-                       });
-        std::partial_sum(offsets_.begin() + 1, offsets_.end(),
-                         offsets_.begin() + 1);
-    }
+    auto size() const { return geometry_.lines(); }
 
-    std::vector<int> offsets_;
+  private:
     const geometry::base<D, T>& geometry_;
     int projection_count_;
     std::vector<T> data_;
diff --git a/python/build/pybind11 b/python/build/pybind11
index 6b51619..e06077b 160000
--- a/python/build/pybind11
+++ b/python/build/pybind11
@@ -1 +1 @@
-Subproject commit 6b51619a7cc5a34bbcd4474fb70954d009cebb64
+Subproject commit e06077bf4718050c4f95c3b4741c75f7f7041b83