diff --git a/Makefile b/Makefile
index 7b082a0a1a10..32092e9bd8f6 100644
--- a/Makefile
+++ b/Makefile
@@ -321,7 +321,6 @@ SOURCE_FILES = \
   Generator.cpp \
   HexagonOffload.cpp \
   HexagonOptimize.cpp \
-  Image.cpp \
   ImageParam.cpp \
   Interval.cpp \
   InjectHostDevBufferCopies.cpp \
@@ -458,7 +457,7 @@ HEADER_FILES = \
   HexagonOffload.h \
   HexagonOptimize.h \
   runtime/HalideRuntime.h \
-  Image.h \
+  runtime/HalideImage.h \
   ImageParam.h \
   Interval.h \
   InjectHostDevBufferCopies.h \
@@ -1068,7 +1067,7 @@ performance_%: $(BIN_DIR)/performance_%
 
 error_%: $(BIN_DIR)/error_%
 	@-mkdir -p $(TMP_DIR)
-	cd $(TMP_DIR) ; $(CURDIR)/$< 2>&1 | egrep --q "terminating with uncaught exception|^terminate called|^Error"
+	cd $(TMP_DIR) && $(CURDIR)/$< 2>&1 | grep -E -q "terminating with uncaught exception|^terminate called|^Error|Assertion.*failed"
 	@-echo
 
 warning_%: $(BIN_DIR)/warning_%
@@ -1251,6 +1250,7 @@ install: $(LIB_DIR)/libHalide.a $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR
 	mkdir -p $(PREFIX)/include $(PREFIX)/bin $(PREFIX)/lib $(PREFIX)/share/halide/tutorial/images $(PREFIX)/share/halide/tools $(PREFIX)/share/halide/tutorial/figures
 	cp $(LIB_DIR)/libHalide.a $(BIN_DIR)/libHalide.$(SHARED_EXT) $(PREFIX)/lib
 	cp $(INCLUDE_DIR)/Halide.h $(PREFIX)/include
+	cp $(INCLUDE_DIR)/HalideImage.h $(PREFIX)/include
 	cp $(INCLUDE_DIR)/HalideRuntim*.h $(PREFIX)/include
 	cp $(ROOT_DIR)/tutorial/images/*.png $(PREFIX)/share/halide/tutorial/images
 	cp $(ROOT_DIR)/tutorial/figures/*.gif $(PREFIX)/share/halide/tutorial/figures
@@ -1261,7 +1261,6 @@ install: $(LIB_DIR)/libHalide.a $(BIN_DIR)/libHalide.$(SHARED_EXT) $(INCLUDE_DIR
 	cp $(ROOT_DIR)/tutorial/*.sh $(PREFIX)/share/halide/tutorial
 	cp $(ROOT_DIR)/tools/mex_halide.m $(PREFIX)/share/halide/tools
 	cp $(ROOT_DIR)/tools/GenGen.cpp $(PREFIX)/share/halide/tools
-	cp $(ROOT_DIR)/tools/halide_image.h $(PREFIX)/share/halide/tools
 	cp $(ROOT_DIR)/tools/halide_image_io.h $(PREFIX)/share/halide/tools
 	cp $(ROOT_DIR)/tools/halide_image_info.h $(PREFIX)/share/halide/tools
 
@@ -1270,6 +1269,7 @@ $(DISTRIB_DIR)/halide.tgz: $(LIB_DIR)/libHalide.a $(BIN_DIR)/libHalide.$(SHARED_
 	cp $(BIN_DIR)/libHalide.$(SHARED_EXT) $(DISTRIB_DIR)/bin
 	cp $(LIB_DIR)/libHalide.a $(DISTRIB_DIR)/lib
 	cp $(INCLUDE_DIR)/Halide.h $(DISTRIB_DIR)/include
+	cp $(INCLUDE_DIR)/HalideImage.h $(DISTRIB_DIR)/include
 	cp $(INCLUDE_DIR)/HalideRuntim*.h $(DISTRIB_DIR)/include
 	cp $(ROOT_DIR)/tutorial/images/*.png $(DISTRIB_DIR)/tutorial/images
 	cp $(ROOT_DIR)/tutorial/figures/*.gif $(DISTRIB_DIR)/tutorial/figures
@@ -1280,12 +1280,11 @@ $(DISTRIB_DIR)/halide.tgz: $(LIB_DIR)/libHalide.a $(BIN_DIR)/libHalide.$(SHARED_
 	cp $(ROOT_DIR)/tutorial/*.sh $(DISTRIB_DIR)/tutorial
 	cp $(ROOT_DIR)/tools/mex_halide.m $(DISTRIB_DIR)/tools
 	cp $(ROOT_DIR)/tools/GenGen.cpp $(DISTRIB_DIR)/tools
-	cp $(ROOT_DIR)/tools/halide_image.h $(DISTRIB_DIR)/tools
 	cp $(ROOT_DIR)/tools/halide_image_io.h $(DISTRIB_DIR)/tools
 	cp $(ROOT_DIR)/tools/halide_image_info.h $(DISTRIB_DIR)/tools
 	cp $(ROOT_DIR)/README.md $(DISTRIB_DIR)
 	ln -sf $(DISTRIB_DIR) halide
-	tar -czf $(DISTRIB_DIR)/halide.tgz halide/bin halide/lib halide/include halide/tutorial halide/README.md halide/tools/mex_halide.m halide/tools/GenGen.cpp halide/tools/halide_image.h halide/tools/halide_image_io.h halide/tools/halide_image_info.h
+	tar -czf $(DISTRIB_DIR)/halide.tgz halide/bin halide/lib halide/include halide/tutorial halide/README.md halide/tools/mex_halide.m halide/tools/GenGen.cpp halide/tools/halide_image_io.h halide/tools/halide_image_info.h
 	rm -rf halide
 
 .PHONY: distrib
diff --git a/apps/bilateral_grid/filter.cpp b/apps/bilateral_grid/filter.cpp
index 44d38af9aa84..88d3e9386f8e 100644
--- a/apps/bilateral_grid/filter.cpp
+++ b/apps/bilateral_grid/filter.cpp
@@ -5,10 +5,10 @@
 #include "bilateral_grid.h"
 
 #include "benchmark.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "halide_image_io.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
 
diff --git a/apps/blur/test.cpp b/apps/blur/test.cpp
index 982b16612b25..2f561bee360f 100644
--- a/apps/blur/test.cpp
+++ b/apps/blur/test.cpp
@@ -4,9 +4,9 @@
 #include <cstdio>
 
 #include "benchmark.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 //#define cimg_display 0
 //#include "CImg.h"
diff --git a/apps/c_backend/run.cpp b/apps/c_backend/run.cpp
index c3dead9c22fa..d042d54acc1b 100644
--- a/apps/c_backend/run.cpp
+++ b/apps/c_backend/run.cpp
@@ -2,11 +2,11 @@
 #include <cstdio>
 #include <cstdlib>
 
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "pipeline_c.h"
 #include "pipeline_native.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 extern "C" int an_extern_func(int x, int y) {
     return x + y;
diff --git a/apps/c_backend/run_cpp.cpp b/apps/c_backend/run_cpp.cpp
index bd38e827d90d..85fd38ea5a25 100644
--- a/apps/c_backend/run_cpp.cpp
+++ b/apps/c_backend/run_cpp.cpp
@@ -2,11 +2,11 @@
 #include <cstdio>
 #include <cstdlib>
 
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "pipeline_cpp_native.h"
 #include "pipeline_cpp_cpp.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 extern "C" int an_extern_c_func(int a1, float a2) {
     return (int)(a1 + a2);
diff --git a/apps/camera_pipe/fcam/Demosaic.h b/apps/camera_pipe/fcam/Demosaic.h
index 06068b4af98a..fd5ce1dac6fc 100644
--- a/apps/camera_pipe/fcam/Demosaic.h
+++ b/apps/camera_pipe/fcam/Demosaic.h
@@ -3,7 +3,7 @@
 
 /** \file
  * Converting RAW data to RGB24 by demosiacking and gamma correcting. */
-#include "halide_image.h"
+#include "HalideImage.h"
 
 namespace FCam {
 
diff --git a/apps/camera_pipe/fcam/Demosaic_ARM.h b/apps/camera_pipe/fcam/Demosaic_ARM.h
index e51e9093984b..8b901bc1e6e3 100644
--- a/apps/camera_pipe/fcam/Demosaic_ARM.h
+++ b/apps/camera_pipe/fcam/Demosaic_ARM.h
@@ -2,7 +2,7 @@
 #define FCAM_DEMOSAIC_ARM_H
 //#ifdef FCAM_ARCH_ARM
 
-#include "halide_image.h"
+#include "HalideImage.h"
 
 // Arm-specific optimized post-processing routines
 
diff --git a/apps/camera_pipe/process.cpp b/apps/camera_pipe/process.cpp
index efae43b3cb9a..f7fb68ba5b16 100644
--- a/apps/camera_pipe/process.cpp
+++ b/apps/camera_pipe/process.cpp
@@ -3,7 +3,7 @@
 
 #include "benchmark.h"
 #include "curved.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "halide_image_io.h"
 #include "halide_malloc_trace.h"
 
@@ -12,7 +12,7 @@
 #include <cstdlib>
 #include <cassert>
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
     if (argc < 7) {
diff --git a/apps/interpolate/interpolate.cpp b/apps/interpolate/interpolate.cpp
index c855e2997845..349c22668fa1 100644
--- a/apps/interpolate/interpolate.cpp
+++ b/apps/interpolate/interpolate.cpp
@@ -8,7 +8,7 @@ using namespace Halide;
 #include "benchmark.h"
 #include "halide_image_io.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 using std::vector;
 
diff --git a/apps/local_laplacian/process.cpp b/apps/local_laplacian/process.cpp
index 982bd28d9cb4..f1b2d2ec6e1a 100644
--- a/apps/local_laplacian/process.cpp
+++ b/apps/local_laplacian/process.cpp
@@ -4,10 +4,10 @@
 #include "local_laplacian.h"
 
 #include "benchmark.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "halide_image_io.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
     if (argc < 7) {
diff --git a/apps/modules/run_pipeline.cpp b/apps/modules/run_pipeline.cpp
index 17bdbeb98153..6c164f8b501a 100644
--- a/apps/modules/run_pipeline.cpp
+++ b/apps/modules/run_pipeline.cpp
@@ -1,9 +1,9 @@
 #include "pipeline.h"
 
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "halide_image_io.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
 
diff --git a/apps/templates/tests/example_test.cpp b/apps/templates/tests/example_test.cpp
index 0b302c0801c9..d5351b83e815 100644
--- a/apps/templates/tests/example_test.cpp
+++ b/apps/templates/tests/example_test.cpp
@@ -5,7 +5,7 @@
 #include "HalideRuntimeOpenGL.h"
 #include "SimpleAppAPI.h"
 
-#include "halide_image.h"
+#include "HalideImage.h"
 
 #include "example4.h"
 #include "example4_glsl.h"
diff --git a/apps/wavelet/wavelet.cpp b/apps/wavelet/wavelet.cpp
index 7ef331c967cd..442b2391ee60 100644
--- a/apps/wavelet/wavelet.cpp
+++ b/apps/wavelet/wavelet.cpp
@@ -5,10 +5,10 @@
 #include "daubechies_x.h"
 #include "inverse_daubechies_x.h"
 
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "halide_image_io.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 namespace {
 
diff --git a/src/AddImageChecks.cpp b/src/AddImageChecks.cpp
index 01ddc88ceef9..9a3e2e0e37a2 100644
--- a/src/AddImageChecks.cpp
+++ b/src/AddImageChecks.cpp
@@ -387,7 +387,7 @@ Stmt add_image_checks(Stmt s,
 
                     stride_constrained = param.stride_constraint(i);
                 } else if (image.defined() && (int)i < image.dimensions()) {
-                    stride_constrained = image.stride(i);
+                    stride_constrained = image.dim(i).stride();
                 }
 
                 std::string min0_name = buffer_name + ".0.min." + dim;
@@ -404,9 +404,9 @@ Stmt add_image_checks(Stmt s,
                     extent_constrained = Variable::make(Int(32), extent0_name);
                 }
             } else if (image.defined() && (int)i < image.dimensions()) {
-                stride_constrained = image.stride(i);
-                extent_constrained = image.extent(i);
-                min_constrained = image.min(i);
+                stride_constrained = image.dim(i).stride();
+                extent_constrained = image.dim(i).extent();
+                min_constrained = image.dim(i).min();
             } else if (param.defined()) {
                 stride_constrained = param.stride_constraint(i);
                 extent_constrained = param.extent_constraint(i);
diff --git a/src/Argument.h b/src/Argument.h
index 79ae0f657ced..eb81b701af69 100644
--- a/src/Argument.h
+++ b/src/Argument.h
@@ -72,6 +72,9 @@ struct Argument {
             << "Scalar max must not be defined for Buffer Arguments";
     }
 
+    template<typename T, int D>
+    Argument(const Image<T, D> &im) : kind(InputBuffer), dimensions(im.dimensions()), type(im.type()) {}
+
     bool is_buffer() const { return kind == InputBuffer || kind == OutputBuffer; }
     bool is_scalar() const { return kind == InputScalar; }
 
diff --git a/src/BoundaryConditions.h b/src/BoundaryConditions.h
index 6e262e02283c..00114224f32a 100644
--- a/src/BoundaryConditions.h
+++ b/src/BoundaryConditions.h
@@ -99,7 +99,7 @@ template <typename T>
 inline NO_INLINE Func constant_exterior(T func_like, Tuple value) {
     std::vector<std::pair<Expr, Expr>> object_bounds;
     for (int i = 0; i < func_like.dimensions(); i++) {
-        object_bounds.push_back(std::make_pair(Expr(func_like.min(i)), Expr(func_like.extent(i))));
+        object_bounds.push_back(std::make_pair(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent())));
     }
 
     return constant_exterior(Internal::func_like_to_func(func_like), value, object_bounds);
@@ -143,7 +143,7 @@ template <typename T>
 inline NO_INLINE Func repeat_edge(T func_like) {
     std::vector<std::pair<Expr, Expr>> object_bounds;
     for (int i = 0; i < func_like.dimensions(); i++) {
-        object_bounds.push_back(std::make_pair(Expr(func_like.min(i)), Expr(func_like.extent(i))));
+        object_bounds.push_back(std::make_pair(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent())));
     }
 
     return repeat_edge(Internal::func_like_to_func(func_like), object_bounds);
@@ -178,7 +178,7 @@ template <typename T>
 inline NO_INLINE Func repeat_image(T func_like) {
     std::vector<std::pair<Expr, Expr>> object_bounds;
     for (int i = 0; i < func_like.dimensions(); i++) {
-        object_bounds.push_back(std::make_pair(Expr(func_like.min(i)), Expr(func_like.extent(i))));
+        object_bounds.push_back(std::make_pair(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent())));
     }
 
     return repeat_image(Internal::func_like_to_func(func_like), object_bounds);
@@ -212,7 +212,7 @@ template <typename T>
 inline NO_INLINE Func mirror_image(T func_like) {
     std::vector<std::pair<Expr, Expr>> object_bounds;
     for (int i = 0; i < func_like.dimensions(); i++) {
-        object_bounds.push_back(std::make_pair(Expr(func_like.min(i)), Expr(func_like.extent(i))));
+        object_bounds.push_back(std::make_pair(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent())));
     }
 
     return mirror_image(Internal::func_like_to_func(func_like), object_bounds);
@@ -249,7 +249,7 @@ template <typename T>
 inline NO_INLINE Func mirror_interior(T func_like) {
     std::vector<std::pair<Expr, Expr>> object_bounds;
     for (int i = 0; i < func_like.dimensions(); i++) {
-        object_bounds.push_back(std::make_pair(Expr(func_like.min(i)), Expr(func_like.extent(i))));
+        object_bounds.push_back(std::make_pair(Expr(func_like.dim(i).min()), Expr(func_like.dim(i).extent())));
     }
 
     return mirror_interior(Internal::func_like_to_func(func_like), object_bounds);
diff --git a/src/Bounds.cpp b/src/Bounds.cpp
index 7fd88275de23..4edc01a284fd 100644
--- a/src/Bounds.cpp
+++ b/src/Bounds.cpp
@@ -1527,12 +1527,13 @@ void bounds_test() {
     vector<Expr> input_site_2 = {2*x+1};
     vector<Expr> output_site = {x+1};
 
-    Buffer in(Int(32), {10}, nullptr, "input");
+    Image<int32_t, 1> in(10);
+    Buffer in_buf(in, "input");
 
     Stmt loop = For::make("x", 3, 10, ForType::Serial, DeviceAPI::Host,
                           Provide::make("output",
-                                        {Add::make(Call::make(in, input_site_1),
-                                                   Call::make(in, input_site_2))},
+                                        {Add::make(Call::make(in_buf, input_site_1),
+                                                   Call::make(in_buf, input_site_2))},
                                         output_site));
 
     map<string, Box> r;
diff --git a/src/Buffer.cpp b/src/Buffer.cpp
index 06ea304a3c15..0230e2f9ba86 100644
--- a/src/Buffer.cpp
+++ b/src/Buffer.cpp
@@ -4,101 +4,17 @@
 #include "JITModule.h"
 #include "runtime/HalideRuntime.h"
 #include "Target.h"
+#include "Var.h"
+#include "IREquality.h"
+#include "IROperator.h"
 
 namespace Halide {
 namespace Internal {
 
-namespace {
-
-uint64_t multiply_buffer_size_check_overflow(uint64_t size, uint64_t factor, const std::string &name) {
-    // Ignore the dimensions for which the extent is zero.
-    if (!factor) return size;
-
-    // Multiply and check for 64-bit overflow
-    uint64_t result = size * factor;
-    bool overflow = (result / factor) != size;
-
-    // Check against the limits Halide internally assumes in its compiled code.
-    overflow |= (sizeof(size_t) == 4) && ((result >> 31) != 0);
-
-    // In 64-bit with LargeBuffers *not* set, the limit above is the
-    // correct one, however at Buffer creation time we don't know what
-    // pipelines it will be used in, so we must be conservative and
-    // defer the error until the user actually passes the buffer into
-    // a pipeline they shouldn't have.
-    overflow |= (sizeof(size_t) == 8) && ((result >> 63) != 0);
-
-    // Assert there was no overflow.
-    user_assert(!overflow)
-        << "Total size of buffer " << name << " exceeds 2^" << ((sizeof(size_t) * 8) - 1) << " - 1\n";
-    return result;
-}
-
-}
-
 struct BufferContents {
-    /** The buffer_t object we're wrapping. */
-    buffer_t buf;
-
-    /** The type of the allocation. buffer_t's don't currently track this so we do it here. */
-    Type type;
-
-    /** If we made the allocation ourselves via a Buffer constructor,
-     * and hence should delete it when this buffer dies, then this
-     * pointer is set to the memory we need to free. Otherwise it's
-     * nullptr. */
-    uint8_t *allocation;
-
-    /** How many Buffer objects point to this BufferContents */
-    mutable RefCount ref_count;
-
-    /** What is the name of the buffer? Useful for debugging symbols. */
+    Image<void> image;
     std::string name;
-
-    BufferContents(Type t, int x_size, int y_size, int z_size, int w_size,
-                   uint8_t* data, const std::string &n) :
-        type(t), allocation(nullptr), name(n.empty() ? unique_name('b') : n) {
-        user_assert(t.lanes() == 1) << "Can't create of a buffer of a vector type";
-        buf.elem_size = t.bytes();
-        uint64_t size = 1;
-        size = multiply_buffer_size_check_overflow(size, x_size, name);
-        size = multiply_buffer_size_check_overflow(size, y_size, name);
-        size = multiply_buffer_size_check_overflow(size, z_size, name);
-        size = multiply_buffer_size_check_overflow(size, w_size, name);
-        size = multiply_buffer_size_check_overflow(size, buf.elem_size, name);
-
-        if (!data) {
-            // There's no way for this to overflow without the buffer already being > 2^63-1
-            size += 32;
-            allocation = (uint8_t *)calloc(1, (size_t)size);
-            user_assert(allocation) << "Out of memory allocating buffer " << name << " of size " << size << "\n";
-            buf.host = allocation;
-            while ((size_t)(buf.host) & 0x1f) buf.host++;
-        } else {
-            buf.host = data;
-        }
-        buf.dev = 0;
-        buf.host_dirty = false;
-        buf.dev_dirty = false;
-        buf.extent[0] = x_size;
-        buf.extent[1] = y_size;
-        buf.extent[2] = z_size;
-        buf.extent[3] = w_size;
-        buf.stride[0] = 1;
-        buf.stride[1] = x_size;
-        buf.stride[2] = x_size*y_size;
-        buf.stride[3] = x_size*y_size*z_size;
-        buf.min[0] = 0;
-        buf.min[1] = 0;
-        buf.min[2] = 0;
-        buf.min[3] = 0;
-    }
-
-    BufferContents(Type t, const buffer_t *b, const std::string &n) :
-        type(t), allocation(nullptr), name(n.empty() ? unique_name('b') : n) {
-        buf = *b;
-        user_assert(t.lanes() == 1) << "Can't create of a buffer of a vector type";
-    }
+    mutable RefCount ref_count;
 };
 
 template<>
@@ -108,156 +24,96 @@ EXPORT RefCount &ref_count<BufferContents>(const BufferContents *p) {
 
 template<>
 EXPORT void destroy<BufferContents>(const BufferContents *p) {
-    // Ignore errors. We may be cleaning up a buffer after an earlier
-    // error, and asserting would re-raise it.
-    halide_device_free(nullptr, const_cast<buffer_t *>(&p->buf));
-    free(p->allocation);
     delete p;
 }
-
 }
 
 namespace {
-int32_t size_or_zero(const std::vector<int32_t> &sizes, size_t index) {
-    return (index < sizes.size()) ? sizes[index] : 0;
-}
-
 std::string make_buffer_name(const std::string &n, Buffer *b) {
     if (n.empty()) {
-        return Internal::make_entity_name(b, "Halide::Buffer", 'b');
+        return Internal::make_entity_name(b, "Halide::Buffer", 'b');  // Buffer.h declares Buffer in namespace Halide, not Halide::Internal
     } else {
         return n;
     }
 }
 }
 
-Buffer::Buffer(Type t, int x_size, int y_size, int z_size, int w_size,
-               uint8_t* data, const std::string &name) :
-    contents(new Internal::BufferContents(t, x_size, y_size, z_size, w_size, data,
-                                          make_buffer_name(name, this))) {
-}
-
-Buffer::Buffer(Type t, const std::vector<int32_t> &sizes,
-               uint8_t* data, const std::string &name) :
-    contents(new Internal::BufferContents(t,
-                                          size_or_zero(sizes, 0),
-                                          size_or_zero(sizes, 1),
-                                          size_or_zero(sizes, 2),
-                                          size_or_zero(sizes, 3),
-                                          data,
-                                          make_buffer_name(name, this))) {
-    user_assert(sizes.size() <= 4) << "Buffer dimensions greater than 4 are not supported.";
-}
-
-Buffer::Buffer(Type t, const buffer_t *buf, const std::string &name) :
-    contents(new Internal::BufferContents(t, buf,
-                                          make_buffer_name(name, this))) {
-}
-
-void *Buffer::host_ptr() const {
-    user_assert(defined()) << "Buffer is undefined\n";
-    return (void *)contents->buf.host;
-}
-
-buffer_t *Buffer::raw_buffer() const {
-    user_assert(defined()) << "Buffer is undefined\n";
-    return &(contents->buf);
-}
-
-uint64_t Buffer::device_handle() const {
-    user_assert(defined()) << "Buffer is undefined\n";
-    return contents->buf.dev;
-}
+Buffer::Buffer(const Image<void> &buf, std::string name) :
+    contents(new Internal::BufferContents {Image<void>(buf), make_buffer_name(name, this)}) {}
 
-bool Buffer::host_dirty() const {
-    user_assert(defined()) << "Buffer is undefined\n";
-    return contents->buf.host_dirty;
-}
+Buffer::Buffer(Type t, const buffer_t &buf, std::string name) :
+    contents(new Internal::BufferContents {Image<void>(t, buf), make_buffer_name(name, this)}) {}
 
-void Buffer::set_host_dirty(bool dirty) {
-    user_assert(defined()) << "Buffer is undefined\n";
-    contents->buf.host_dirty = dirty;
-}
-
-bool Buffer::device_dirty() const {
-    user_assert(defined()) << "Buffer is undefined\n";
-    return contents->buf.dev_dirty;
-}
+Buffer::Buffer(Type t, const std::vector<int> &size, std::string name) :
+    contents(new Internal::BufferContents {Image<void>(t, size), make_buffer_name(name, this)}) {}
 
-void Buffer::set_device_dirty(bool dirty) {
-    user_assert(defined()) << "Buffer is undefined\n";
-    contents->buf.dev_dirty = dirty;
+bool Buffer::same_as(const Buffer &other) const {
+    return contents.same_as(other.contents);
 }
 
-int Buffer::dimensions() const {
-    for (int i = 0; i < 4; i++) {
-        if (extent(i) == 0) return i;
-    }
-    return 4;
+Image<void> &Buffer::get() {
+    return contents->image;
 }
 
-int Buffer::extent(int dim) const {
-    user_assert(defined()) << "Buffer is undefined\n";
-    user_assert(dim >= 0 && dim < 4) << "We only support 4-dimensional buffers for now";
-    return contents->buf.extent[dim];
+const Image<void> &Buffer::get() const {
+    return contents->image;
 }
 
-int Buffer::stride(int dim) const {
-    user_assert(defined());
-    user_assert(dim >= 0 && dim < 4) << "We only support 4-dimensional buffers for now";
-    return contents->buf.stride[dim];
+bool Buffer::defined() const {  // True only if this handle has contents and the wrapped image is itself defined.
+    return contents.defined() && contents->image;  // Buffer() leaves contents null, so check before dereferencing.
+}
 
-int Buffer::min(int dim) const {
-    user_assert(defined()) << "Buffer is undefined\n";
-    user_assert(dim >= 0 && dim < 4) << "We only support 4-dimensional buffers for now";
-    return contents->buf.min[dim];
+const std::string &Buffer::name() const {
+    return contents->name;
 }
 
-void Buffer::set_min(int m0, int m1, int m2, int m3) {
-    user_assert(defined()) << "Buffer is undefined\n";
-    contents->buf.min[0] = m0;
-    contents->buf.min[1] = m1;
-    contents->buf.min[2] = m2;
-    contents->buf.min[3] = m3;
+Buffer::operator Argument() const {
+    return Argument(name(), Argument::InputBuffer, type(), dimensions());
 }
 
 Type Buffer::type() const {
-    user_assert(defined()) << "Buffer is undefined\n";
-    return contents->type;
+    return contents->image.type();
 }
 
-bool Buffer::same_as(const Buffer &other) const {
-    return contents.same_as(other.contents);
-}
-
-bool Buffer::defined() const {
-    return contents.defined();
-}
-
-const std::string &Buffer::name() const {
-    return contents->name;
+int Buffer::dimensions() const {
+    return contents->image.dimensions();
 }
 
-Buffer::operator Argument() const {
-    return Argument(name(), Argument::InputBuffer, type(), dimensions());
+Image<void>::Dimension Buffer::dim(int i) const {
+    return contents->image.dim(i);
 }
 
-int Buffer::copy_to_host() {
-    return halide_copy_to_host(nullptr, raw_buffer());
+buffer_t *Buffer::raw_buffer() const {
+    return contents->image.raw_buffer();
 }
 
-int Buffer::device_sync() {
-    return halide_device_sync(nullptr, raw_buffer());
+size_t Buffer::size_in_bytes() const {
+    return contents->image.size_in_bytes();
 }
 
-int Buffer::copy_to_device() {
-  return halide_copy_to_device(nullptr, raw_buffer(), nullptr);
+uint8_t *Buffer::host_ptr() const {
+    return raw_buffer()->host;
 }
 
-int Buffer::free_dev_buffer() {
-    return halide_device_free(nullptr, raw_buffer());
+Expr Buffer::operator()(const std::vector<Expr> &args) const {
+    // Cast the inputs to int32
+    std::vector<Expr> int_args;
+    for (Expr e : args) {
+        user_assert(Int(32).can_represent(e.type()))
+            << "Args to a call to an Image must be representable as 32-bit integers.\n";
+        if (equal(e, _)) {
+            // Expand the _ into the appropriate number of implicit vars.
+            int missing_dimensions = dimensions() - (int)args.size() + 1;
+            for (int i = 0; i < missing_dimensions; i++) {
+                int_args.push_back(Var::implicit(i));
+            }
+        } else if (e.type() == Int(32)) {
+            int_args.push_back(e);
+        } else {
+            int_args.push_back(cast<int>(e));
+        }
+    }
+    return Internal::Call::make(*this, int_args);
 }
 
-
 }
diff --git a/src/Buffer.h b/src/Buffer.h
index aadbb09bb1f7..683f1ad3bcad 100644
--- a/src/Buffer.h
+++ b/src/Buffer.h
@@ -5,103 +5,37 @@
  * Defines Buffer - A c++ wrapper around a buffer_t.
  */
 
-#include <stdint.h>
-
-#include "runtime/HalideRuntime.h" // For buffer_t
-#include "IntrusivePtr.h"
-#include "Error.h"
-#include "Type.h"
+#include "runtime/HalideImage.h"
+#include "Expr.h"
+#include "Util.h"
 #include "Argument.h"
 
 namespace Halide {
 namespace Internal {
 struct BufferContents;
-struct JITModule;
 }
 
-/** The internal representation of an image, or other dense array
- * data. The Image type provides a typed view onto a buffer for the
- * purposes of direct manipulation. A buffer may be stored in main
- * memory, or some other memory space (e.g. a gpu). If you want to use
- * this as an Image, see the Image class. Casting a Buffer to an Image
- * will do any appropriate copy-back. This class is a fairly thin
- * wrapper on a buffer_t, which is the C-style type Halide uses for
- * passing buffers around.
- */
+/** A named reference-counted handle on an Image of unknown type and dimensionality */
 class Buffer {
 private:
     Internal::IntrusivePtr<Internal::BufferContents> contents;
 
 public:
     Buffer() : contents(nullptr) {}
+    EXPORT Buffer(const Image<void> &buf, std::string name = "");
+    EXPORT Buffer(Type t, const buffer_t &buf, std::string name = "");
 
-    EXPORT Buffer(Type t, int x_size = 0, int y_size = 0, int z_size = 0, int w_size = 0,
-                  uint8_t* data = nullptr, const std::string &name = "");
-
-    EXPORT Buffer(Type t, const std::vector<int32_t> &sizes,
-                  uint8_t* data = nullptr, const std::string &name = "");
-
-    EXPORT Buffer(Type t, const buffer_t *buf, const std::string &name = "");
+    template<typename T, int D> Buffer(const Image<T, D> &buf, std::string name = "") :
+        Buffer(Image<void>(buf), name) {}
 
-    /** Get a pointer to the host-side memory. */
-    EXPORT void *host_ptr() const;
-
-    /** Get a pointer to the raw buffer_t struct that this class wraps. */
-    EXPORT buffer_t *raw_buffer() const;
-
-    /** Get the device-side pointer/handle for this buffer. Will be
-     * zero if no device was involved in the creation of this
-     * buffer. */
-    EXPORT uint64_t device_handle() const;
-
-    /** Has this buffer been modified on the cpu since last copied to a
-     * device. Not meaningful unless there's a device involved. */
-    EXPORT bool host_dirty() const;
-
-    /** Let Halide know that the host-side memory backing this buffer
-     * has been externally modified. You shouldn't normally need to
-     * call this, because it is done for you when you cast a Buffer to
-     * an Image in order to modify it. */
-    EXPORT void set_host_dirty(bool dirty = true);
-
-    /** Has this buffer been modified on device since last copied to
-     * the cpu. Not meaninful unless there's a device involved. */
-    EXPORT bool device_dirty() const;
-
-    /** Let Halide know that the device-side memory backing this
-     * buffer has been externally modified, and so the cpu-side memory
-     * is invalid. A copy-back will occur the next time you cast this
-     * Buffer to an Image, or the next time this buffer is accessed on
-     * the host in a halide pipeline. */
-    EXPORT void set_device_dirty(bool dirty = true);
-
-    /** Get the dimensionality of this buffer. Uses the convention
-     * that the extent field of a buffer_t should contain zero when
-     * the dimensions end. */
-    EXPORT int dimensions() const;
-
-    /** Get the extent of this buffer in the given dimension. */
-    EXPORT int extent(int dim) const;
-
-    /** Get the distance in memory (measured in the type of the buffer
-     * elements, not bytes) between adjacent elements of this buffer
-     * along the given dimension. For the innermost dimension, this
-     * will usually be one. */
-    EXPORT int stride(int dim) const;
-
-    /** Get the coordinate in the function that this buffer represents
-     * that corresponds to the base address of the buffer. */
-    EXPORT int min(int dim) const;
-
-    /** Set the coordinate in the function that this buffer represents
-     * that corresponds to the base address of the buffer. */
-    EXPORT void set_min(int m0, int m1 = 0, int m2 = 0, int m3 = 0);
-
-    /** Get the Halide type of the contents of this buffer. */
-    EXPORT Type type() const;
+    EXPORT Buffer(Type t, const std::vector<int> &size, std::string name = "");
 
     /** Compare two buffers for identity (not equality of data). */
-    EXPORT bool same_as(const Buffer &other) const;
+    EXPORT bool same_as(const Buffer &other) const;
+
+    /** Get the underlying Image */
+    EXPORT Image<void> &get();
+    EXPORT const Image<void> &get() const;
 
     /** Check if this buffer handle actually points to data. */
     EXPORT bool defined() const;
@@ -112,35 +46,58 @@ class Buffer {
     /** Convert this buffer to an argument to a halide pipeline. */
     EXPORT operator Argument() const;
 
-    /** If this buffer was created *on-device* by a jit-compiled
-     * realization, then copy it back to the cpu-side memory. This is
-     * usually achieved by casting the Buffer to an Image. */
-    EXPORT int copy_to_host();
-
-    /** If this buffer was created by a jit-compiled realization on a
-     * device-aware target (e.g. PTX), then copy the cpu-side data to
-     * the device-side allocation. TODO: I believe this currently
-     * aborts messily if no device-side allocation exists. You might
-     * think you want to do this because you've modified the data
-     * manually on the host before calling another Halide pipeline,
-     * but what you actually want to do in that situation is set the
-     * host_dirty bit so that Halide can manage the copy lazily for
-     * you. Casting the Buffer to an Image sets the dirty bit for
-     * you. */
-    EXPORT int copy_to_device();
-
-    /** If this buffer exists on a GPU, then finish any currently
-     * running computation on that GPU. Useful for benchmarking. */
-    EXPORT int device_sync();
-
-    /** If this buffer was created by a jit-compiled realization on a
-     * device-aware target (e.g. PTX), then free the device-side
-     * allocation, if there is one. Done automatically when the last
-     * reference to this buffer dies. */
-    EXPORT int free_dev_buffer();
+    /** Get the Halide type of the underlying buffer */
+    EXPORT Type type() const;
+
+    /** Get the dimensionality of the underlying buffer */
+    EXPORT int dimensions() const;
+
+    /** Get a dimension from the underlying buffer. */
+    EXPORT Image<void>::Dimension dim(int i) const;
 
+    /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
+    // @{
+    int min(int i) const { return dim(i).min(); }
+    int extent(int i) const { return dim(i).extent(); }
+    int stride(int i) const { return dim(i).stride(); }
+    // @}
+
+    /** Get the size in bytes of the allocation */
+    EXPORT size_t size_in_bytes() const;
+
+    /** Get a pointer to the raw buffer */
+    EXPORT buffer_t *raw_buffer() const;
+
+    /** Get the host pointer */
+    EXPORT uint8_t *host_ptr() const;
+
+    /** Convert a buffer to a typed and dimensioned Image. Does
+     * runtime type checks. */
+    template<typename T, int D>
+    operator Image<T, D>() const {
+        return Image<T, D>(get());
+    }
+
+    /** Make a Call node to a specific site in this buffer. */
+    // @{
+    Expr operator()(const std::vector<Expr> &loc) const;
+
+    template<typename ...Args,
+             // ::type is required: a bare std::enable_if<...> always names a valid type and never SFINAEs.
+             typename = typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value>::type>
+    Expr operator()(Expr first, Args... rest) const {
+        return (*this)(std::vector<Expr>{first, rest...});
+    }
+    // @}
 };
 
+/** An adaptor so that it's possible to access a Halide::Image using Exprs (every index must convert to Expr). */
+template<typename T, int D, typename ...Args,
+         typename = typename std::enable_if<Internal::all_are_convertible<Expr, Args...>::value>::type>
+NO_INLINE Expr image_accessor(const Image<T, D> &im, Expr first, Args... rest) {
+    return Buffer(im)(first, rest...);
+}
+
 }
 
 #endif
diff --git a/src/Closure.cpp b/src/Closure.cpp
index 88ee42fc2b59..63e91c2f92d8 100644
--- a/src/Closure.cpp
+++ b/src/Closure.cpp
@@ -48,14 +48,8 @@ void Closure::visit(const Load *op) {
 
         // If reading an image/buffer, compute the size.
         if (op->image.defined()) {
+            ref.size = op->image.size_in_bytes();
             ref.dimensions = op->image.dimensions();
-            // The size is the offset of one beyond the last element.
-            // TODO(abadams): replace this with halide_buffer_t::size_in_bytes.
-            ref.size = 1;
-            for (int i = 0; i < op->image.dimensions(); i++) {
-                ref.size += (op->image.extent(i) - 1)*op->image.stride(i);
-            }
-            ref.size *= op->image.type().bytes();
         }
     } else {
         debug(3) << "Not adding " << op->name << " to closure\n";
diff --git a/src/FastIntegerDivide.cpp b/src/FastIntegerDivide.cpp
index a1ee55c90647..98a16b163118 100644
--- a/src/FastIntegerDivide.cpp
+++ b/src/FastIntegerDivide.cpp
@@ -2,6 +2,7 @@
 
 #include "FastIntegerDivide.h"
 #include "IntegerDivisionTable.h"
+#include "IROperator.h"
 
 namespace Halide {
 
diff --git a/src/FastIntegerDivide.h b/src/FastIntegerDivide.h
index 9398528a211d..761462056daf 100644
--- a/src/FastIntegerDivide.h
+++ b/src/FastIntegerDivide.h
@@ -2,7 +2,6 @@
 #define HALIDE_FAST_INTEGER_DIVIDE_H
 
 #include "IR.h"
-#include "Image.h"
 
 namespace Halide {
 
diff --git a/src/Func.cpp b/src/Func.cpp
index 1275e1f3f90d..c84d665376f8 100644
--- a/src/Func.cpp
+++ b/src/Func.cpp
@@ -16,7 +16,6 @@
 #include "Function.h"
 #include "Argument.h"
 #include "Lower.h"
-#include "Image.h"
 #include "Param.h"
 #include "PrintLoopNest.h"
 #include "Debug.h"
@@ -2340,33 +2339,40 @@ Realization Func::realize(std::vector<int32_t> sizes, const Target &target) {
 }
 
 Realization Func::realize(int x_size, int y_size, int z_size, int w_size, const Target &target) {
-    user_assert(defined()) << "Can't realize undefined Func.\n";
-    vector<Buffer> outputs(func.outputs());
-    for (size_t i = 0; i < outputs.size(); i++) {
-        outputs[i] = Buffer(func.output_types()[i], x_size, y_size, z_size, w_size);
-    }
-    Realization r(outputs);
-    realize(r, target);
-    return r;
+    return realize({x_size, y_size, z_size, w_size}, target);
 }
 
 Realization Func::realize(int x_size, int y_size, int z_size, const Target &target) {
-    return realize(x_size, y_size, z_size, 0, target);
+    return realize({x_size, y_size, z_size}, target);
 }
 
 Realization Func::realize(int x_size, int y_size, const Target &target) {
-    return realize(x_size, y_size, 0, 0, target);
+    return realize({x_size, y_size}, target);
 }
 
 Realization Func::realize(int x_size, const Target &target) {
-    return realize(x_size, 0, 0, 0, target);
+    return realize(std::vector<int>{x_size}, target);
+}
+
+Realization Func::realize(const Target &target) {
+    return realize(std::vector<int>{}, target);
 }
 
 void Func::infer_input_bounds(int x_size, int y_size, int z_size, int w_size) {
     user_assert(defined()) << "Can't infer input bounds on an undefined Func.\n";
     vector<Buffer> outputs(func.outputs());
+    int sizes[] = {x_size, y_size, z_size, w_size};
     for (size_t i = 0; i < outputs.size(); i++) {
-        outputs[i] = Buffer(func.output_types()[i], x_size, y_size, z_size, w_size, (uint8_t *)1);
+        // We're not actually going to read from these outputs, so
+        // make the allocation tiny, then expand them with unsafe
+        // cropping.
+        Image<void> im = Image<void>::make_scalar(func.output_types()[i]);
+        for (int s : sizes) {
+            if (!s) break;
+            im.add_dimension();
+            im.crop(im.dimensions()-1, 0, s);
+        }
+        outputs[i] = im;
     }
     Realization r(outputs);
     infer_input_bounds(r);
@@ -2561,4 +2567,4 @@ EXPORT Var _("_");
 EXPORT Var _0("_0"), _1("_1"), _2("_2"), _3("_3"), _4("_4"),
            _5("_5"), _6("_6"), _7("_7"), _8("_8"), _9("_9");
 
-}
\ No newline at end of file
+}
diff --git a/src/Func.h b/src/Func.h
index 40567e47d621..9b6da713dcd7 100644
--- a/src/Func.h
+++ b/src/Func.h
@@ -14,7 +14,6 @@
 #include "Argument.h"
 #include "RDom.h"
 #include "JITModule.h"
-#include "Image.h"
 #include "Target.h"
 #include "Tuple.h"
 #include "Module.h"
@@ -469,6 +468,12 @@ class Func {
      * Function object. */
     EXPORT explicit Func(Internal::Function f);
 
+    /** Construct a new Func to wrap an Image. */
+    template<typename T, int D>
+    NO_INLINE explicit Func(const Image<T, D> &im) : Func() {
+        (*this)(_) = im(_);
+    }
+
     /** Evaluate this function over some rectangular domain and return
      * the resulting buffer or buffers. Performs compilation if the
      * Func has not previously been realized and jit_compile has not
@@ -509,8 +514,9 @@ class Func {
                                const Target &target = Target());
     EXPORT Realization realize(int x_size, int y_size,
                                const Target &target = Target());
-    EXPORT Realization realize(int x_size = 0,
+    EXPORT Realization realize(int x_size,
                                const Target &target = Target());
+    EXPORT Realization realize(const Target &target = Target());
     // @}
 
     /** Evaluate this function into an existing allocated buffer or
@@ -522,11 +528,9 @@ class Func {
     EXPORT void realize(Realization dst, const Target &target = Target());
     EXPORT void realize(Buffer dst, const Target &target = Target());
 
-    template<typename T>
-    NO_INLINE void realize(Image<T> dst, const Target &target = Target()) {
-        // Images are expected to exist on-host.
+    template<typename T, int D>
+    NO_INLINE void realize(Image<T, D> &dst, const Target &target = Target()) {
         realize(Buffer(dst), target);
-        dst.copy_to_host();
     }
     // @}
 
@@ -539,6 +543,17 @@ class Func {
     EXPORT void infer_input_bounds(int x_size = 0, int y_size = 0, int z_size = 0, int w_size = 0);
     EXPORT void infer_input_bounds(Realization dst);
     EXPORT void infer_input_bounds(Buffer dst);
+
+    template<typename T, int D>
+    NO_INLINE void infer_input_bounds(Image<T, D> &im) {
+        // It's possible for bounds inference to also manipulate
+        // output buffers if their host pointer is null, so we must
+        // take Images by reference and communicate the bounds query
+        // result by modifying the argument.
+        Buffer b(im);
+        infer_input_bounds(b);
+        im = b.get();
+    }
     // @}
 
     /** Statically compile this function to llvm bitcode, with the
@@ -1834,7 +1849,7 @@ NO_INLINE T evaluate(Expr e) {
     Func f;
     f() = e;
     Image<T> im = f.realize();
-    return im(0);
+    return im();
 }
 
 /** JIT-compile and run enough code to evaluate a Halide Tuple. */
@@ -1853,8 +1868,8 @@ NO_INLINE void evaluate(Tuple t, A *a, B *b) {
     Func f;
     f() = t;
     Realization r = f.realize();
-    *a = Image<A>(r[0])(0);
-    *b = Image<B>(r[1])(0);
+    *a = Image<A>(r[0])();
+    *b = Image<B>(r[1])();
 }
 
 template<typename A, typename B, typename C>
@@ -1875,9 +1890,9 @@ NO_INLINE void evaluate(Tuple t, A *a, B *b, C *c) {
     Func f;
     f() = t;
     Realization r = f.realize();
-    *a = Image<A>(r[0])(0);
-    *b = Image<B>(r[1])(0);
-    *c = Image<C>(r[2])(0);
+    *a = Image<A>(r[0])();
+    *b = Image<B>(r[1])();
+    *c = Image<C>(r[2])();
 }
 
 template<typename A, typename B, typename C, typename D>
@@ -1902,10 +1917,10 @@ NO_INLINE void evaluate(Tuple t, A *a, B *b, C *c, D *d) {
     Func f;
     f() = t;
     Realization r = f.realize();
-    *a = Image<A>(r[0])(0);
-    *b = Image<B>(r[1])(0);
-    *c = Image<C>(r[2])(0);
-    *d = Image<D>(r[3])(0);
+    *a = Image<A>(r[0])();
+    *b = Image<B>(r[1])();
+    *c = Image<C>(r[2])();
+    *d = Image<D>(r[3])();
 }
  // @}
 
@@ -1938,7 +1953,7 @@ NO_INLINE T evaluate_may_gpu(Expr e) {
     f() = e;
     Internal::schedule_scalar(f);
     Image<T> im = f.realize();
-    return im(0);
+    return im();
 }
 
 /** JIT-compile and run enough code to evaluate a Halide Tuple. Can
@@ -1959,8 +1974,8 @@ NO_INLINE void evaluate_may_gpu(Tuple t, A *a, B *b) {
     f() = t;
     Internal::schedule_scalar(f);
     Realization r = f.realize();
-    *a = Image<A>(r[0])(0);
-    *b = Image<B>(r[1])(0);
+    *a = Image<A>(r[0])();
+    *b = Image<B>(r[1])();
 }
 
 template<typename A, typename B, typename C>
@@ -1981,9 +1996,9 @@ NO_INLINE void evaluate_may_gpu(Tuple t, A *a, B *b, C *c) {
     f() = t;
     Internal::schedule_scalar(f);
     Realization r = f.realize();
-    *a = Image<A>(r[0])(0);
-    *b = Image<B>(r[1])(0);
-    *c = Image<C>(r[2])(0);
+    *a = Image<A>(r[0])();
+    *b = Image<B>(r[1])();
+    *c = Image<C>(r[2])();
 }
 
 template<typename A, typename B, typename C, typename D>
@@ -2009,10 +2024,10 @@ NO_INLINE void evaluate_may_gpu(Tuple t, A *a, B *b, C *c, D *d) {
     f() = t;
     Internal::schedule_scalar(f);
     Realization r = f.realize();
-    *a = Image<A>(r[0])(0);
-    *b = Image<B>(r[1])(0);
-    *c = Image<C>(r[2])(0);
-    *d = Image<D>(r[3])(0);
+    *a = Image<A>(r[0])();
+    *b = Image<B>(r[1])();
+    *c = Image<C>(r[2])();
+    *d = Image<D>(r[3])();
 }
 // @}
 
diff --git a/src/Function.h b/src/Function.h
index 571e20bcb370..3033efe0a739 100644
--- a/src/Function.h
+++ b/src/Function.h
@@ -34,6 +34,9 @@ struct ExternFuncArgument {
 
     ExternFuncArgument(Buffer b): arg_type(BufferArg), buffer(b) {}
 
+    template<typename T, int D>
+    ExternFuncArgument(const Image<T, D> &im) : arg_type(BufferArg), buffer(im) {}
+
     ExternFuncArgument(Expr e): arg_type(ExprArg), expr(e) {}
     ExternFuncArgument(int e): arg_type(ExprArg), expr(e) {}
     ExternFuncArgument(float e): arg_type(ExprArg), expr(e) {}
diff --git a/src/HexagonOffload.cpp b/src/HexagonOffload.cpp
index cc239be410ef..17bf970d5e3c 100644
--- a/src/HexagonOffload.cpp
+++ b/src/HexagonOffload.cpp
@@ -3,15 +3,15 @@
 #include <memory>
 
 #include "HexagonOffload.h"
-#include "IRMutator.h"
-#include "Substitute.h"
 #include "Closure.h"
-#include "Param.h"
-#include "Image.h"
-#include "LLVM_Output.h"
-#include "RemoveTrivialForLoops.h"
 #include "InjectHostDevBufferCopies.h"
+#include "IRMutator.h"
+#include "IROperator.h"
+#include "LLVM_Output.h"
 #include "LLVM_Headers.h"
+#include "Param.h"
+#include "RemoveTrivialForLoops.h"
+#include "Substitute.h"
 
 namespace Halide {
 namespace Internal {
@@ -85,9 +85,10 @@ class InjectHexagonRpc : public IRMutator {
     Expr state_var(const std::string& name, Type type) {
         Expr& var = state_vars[name];
         if (!var.defined()) {
-            Buffer storage(type, {}, nullptr, name + "_buf");
-            *(void **)storage.host_ptr() = nullptr;
-            var = Load::make(type_of<void*>(), name + "_buf", 0, storage, Parameter());
+            Image<void *> storage = Image<void *>::make_scalar();
+            storage() = nullptr;
+            Buffer buf(storage, name + "_buf");
+            var = Load::make(type_of<void*>(), name + "_buf", 0, buf, Parameter());
         }
         return var;
     }
@@ -108,10 +109,10 @@ class InjectHexagonRpc : public IRMutator {
     // Create a Buffer containing the given buffer/size, and return an
     // expression for a pointer to the first element.
     Expr buffer_ptr(const uint8_t* buffer, size_t size, const char* name) {
-        Buffer code(type_of<uint8_t>(), {(int)size}, nullptr, name);
-        memcpy(code.host_ptr(), buffer, (int)size);
-
-        Expr ptr_0 = Load::make(type_of<uint8_t>(), name, 0, code, Parameter());
+        Image<uint8_t> code((int)size);
+        memcpy(code.data(), buffer, (int)size);
+        Buffer buf(code, name);
+        Expr ptr_0 = Load::make(type_of<uint8_t>(), name, 0, buf, Parameter());
         return Call::make(Handle(), Call::address_of, {ptr_0}, Call::Intrinsic);
     }
 
diff --git a/src/Image.h b/src/Image.h
deleted file mode 100644
index e23cd499f62d..000000000000
--- a/src/Image.h
+++ /dev/null
@@ -1,304 +0,0 @@
-#ifndef HALIDE_IMAGE_H
-#define HALIDE_IMAGE_H
-
-/** \file
- * Defines Halide's Image data type
- */
-
-#include "Var.h"
-#include "Tuple.h"
-#include "Target.h"
-
-namespace Halide {
-
-/** A base class for Images, which are typed accessors on
- * Buffers. This exists to make the implementations of certain methods
- * of Image private, so that they can safely throw errors without the
- * risk of being inlined (which in turns messes up reporting of line
- * numbers). */
-class ImageBase {
-protected:
-    /** The underlying memory object */
-    Buffer buffer;
-
-    /** The address of the zero coordinate. The buffer_t stores the
-     * address of the min coordinate, but it's easier to index off the
-     * zero coordinate. */
-    void *origin;
-
-    /** The strides. These fields are also stored in the buffer, but
-     * they're cached here in the handle to make operator() fast. This
-     * is safe to do because the buffer is never modified.
-     */
-    int stride_0, stride_1, stride_2, stride_3;
-
-    /** The dimensionality. */
-    int dims;
-
-    /** The size of each element. */
-    int elem_size;
-
-    /** Prepare the buffer to be used as an image. Makes sure that the
-     * cached strides are correct, and that the image data is on the
-     * host. */
-    void prepare_for_direct_pixel_access();
-
-    bool add_implicit_args_if_placeholder(std::vector<Expr> &args,
-                                          Expr last_arg,
-                                          int total_args,
-                                          bool placeholder_seen) const;
-public:
-    /** Construct an undefined image handle */
-    ImageBase() : origin(nullptr), stride_0(0), stride_1(0), stride_2(0), stride_3(0), dims(0) {}
-
-    /** Allocate an image with the given dimensions. */
-    EXPORT ImageBase(Type t, int x, int y = 0, int z = 0, int w = 0, const std::string &name = "");
-
-    /** Wrap a buffer in an Image object, so that we can directly
-     * access its pixels in a type-safe way. */
-    EXPORT ImageBase(Type t, const Buffer &buf);
-
-    /** Wrap a single-element realization in an Image object. */
-    EXPORT ImageBase(Type t, const Realization &r);
-
-    /** Wrap a buffer_t in an Image object, so that we can access its
-     * pixels. */
-    EXPORT ImageBase(Type t, const buffer_t *b, const std::string &name = "");
-
-    /** Get the name of this image. */
-    EXPORT const std::string &name() const;
-
-    /** Manually copy-back data to the host, if it's on a device. This
-     * is done for you if you construct an image from a buffer, but
-     * you might need to call this if you realize a gpu kernel into an
-     * existing image */
-    EXPORT void copy_to_host();
-
-    /** Mark the buffer as dirty-on-host.  is done for you if you
-     * construct an image from a buffer, but you might need to call
-     * this if you realize a gpu kernel into an existing image, or
-     * modify the data via some other back-door. */
-    EXPORT void set_host_dirty(bool dirty = true);
-
-    /** Check if this image handle points to actual data */
-    EXPORT bool defined() const;
-
-    /** Get the dimensionality of the data. Typically two for grayscale images, and three for color images. */
-    EXPORT int dimensions() const;
-
-    /** Get the size of a dimension */
-    EXPORT int extent(int dim) const;
-
-    /** Get the min coordinate of a dimension. The top left of the
-     * image represents this point in a function that was realized
-     * into this image. */
-    EXPORT int min(int dim) const;
-
-    /** Set the min coordinates of a dimension. */
-    EXPORT void set_min(int m0, int m1 = 0, int m2 = 0, int m3 = 0);
-
-    /** Get the number of elements in the buffer between two adjacent
-     * elements in the given dimension. For example, the stride in
-     * dimension 0 is usually 1, and the stride in dimension 1 is
-     * usually the extent of dimension 0. This is not necessarily true
-     * though. */
-    EXPORT int stride(int dim) const;
-
-    /** Get the extent of dimension 0, which by convention we use as
-     * the width of the image. Unlike extent(0), returns one if the
-     * buffer is zero-dimensional. */
-    EXPORT int width() const;
-
-    /** Get the extent of dimension 1, which by convention we use as
-     * the height of the image. Unlike extent(1), returns one if the
-     * buffer has fewer than two dimensions. */
-    EXPORT int height() const;
-
-    /** Get the extent of dimension 2, which by convention we use as
-     * the number of color channels (often 3). Unlike extent(2),
-     * returns one if the buffer has fewer than three dimensions. */
-    EXPORT int channels() const;
-
-    /** Get the minimum coordinate in dimension 0, which by convention
-     * is the coordinate of the left edge of the image. Returns zero
-     * for zero-dimensional images. */
-    EXPORT int left() const;
-
-    /** Get the maximum coordinate in dimension 0, which by convention
-     * is the coordinate of the right edge of the image. Returns zero
-     * for zero-dimensional images. */
-    EXPORT int right() const;
-
-    /** Get the minimum coordinate in dimension 1, which by convention
-     * is the top of the image. Returns zero for zero- or
-     * one-dimensional images. */
-    EXPORT int top() const;
-
-    /** Get the maximum coordinate in dimension 1, which by convention
-     * is the bottom of the image. Returns zero for zero- or
-     * one-dimensional images. */
-    EXPORT int bottom() const;
-
-    /** Construct an expression which loads from this image. The
-     * location is extended with enough implicit variables to match
-     * the dimensionality of the image (see \ref Var::implicit) */
-    // @{
-    EXPORT Expr operator()() const;
-    EXPORT Expr operator()(Expr x) const;
-    EXPORT Expr operator()(Expr x, Expr y) const;
-    EXPORT Expr operator()(Expr x, Expr y, Expr z) const;
-    EXPORT Expr operator()(Expr x, Expr y, Expr z, Expr w) const;
-    EXPORT Expr operator()(std::vector<Expr>) const;
-    EXPORT Expr operator()(std::vector<Var>) const;
-    // @}
-
-    /** Get a pointer to the raw buffer_t that this image holds */
-    EXPORT buffer_t *raw_buffer() const;
-
-    /** Get the address of a particular pixel. */
-    void *address_of(int x, int y = 0, int z = 0, int w = 0) const {
-        uint8_t *ptr = (uint8_t *)origin;
-        ptrdiff_t offset = ((ptrdiff_t)x*stride_0 +
-                            (ptrdiff_t)y*stride_1 +
-                            (ptrdiff_t)z*stride_2 +
-                            (ptrdiff_t)w*stride_3);
-        return (void *)(ptr + offset * elem_size);
-    }
-};
-
-/** A reference-counted handle on a dense multidimensional array
- * containing scalar values of type T. Can be directly accessed and
- * modified. May have up to four dimensions. Color images are
- * represented as three-dimensional, with the third dimension being
- * the color channel. In general we store color images in
- * color-planes, as opposed to packed RGB, because this tends to
- * vectorize more cleanly. */
-template<typename T>
-class Image : public ImageBase {
-public:
-    typedef T ElemType;
-
-    /** Construct an undefined image handle */
-    Image() : ImageBase() {}
-
-    /** Allocate an image with the given dimensions. */
-    // @{
-    NO_INLINE Image(int x, int y = 0, int z = 0, int w = 0, const std::string &name = "") :
-        ImageBase(type_of<T>(), x, y, z, w, name) {}
-
-    NO_INLINE Image(int x, int y, int z, const std::string &name) :
-        ImageBase(type_of<T>(), x, y, z, 0, name) {}
-
-    NO_INLINE Image(int x, int y, const std::string &name) :
-        ImageBase(type_of<T>(), x, y, 0, 0, name) {}
-
-    NO_INLINE Image(int x, const std::string &name) :
-        ImageBase(type_of<T>(), x, 0, 0, 0, name) {}
-    // @}
-
-    /** Wrap a buffer in an Image object, so that we can directly
-     * access its pixels in a type-safe way. */
-    NO_INLINE Image(const Buffer &buf) : ImageBase(type_of<T>(), buf) {}
-
-    /** Wrap a single-element realization in an Image object. */
-    NO_INLINE Image(const Realization &r) : ImageBase(type_of<T>(), r) {}
-
-    /** Wrap a buffer_t in an Image object, so that we can access its
-     * pixels. */
-    NO_INLINE Image(const buffer_t *b, const std::string &name = "") :
-        ImageBase(type_of<T>(), b, name) {}
-
-    /** Get a pointer to the element at the min location. */
-    NO_INLINE T *data() const {
-        user_assert(defined()) << "data of undefined Image\n";
-        return (T *)buffer.host_ptr();
-    }
-
-    using ImageBase::operator();
-
-    /** Assuming this image is one-dimensional, get the value of the
-     * element at position x */
-    const T &operator()(int x) const {
-        return *((T *)(address_of(x)));
-    }
-
-    /** Assuming this image is two-dimensional, get the value of the
-     * element at position (x, y) */
-    const T &operator()(int x, int y) const {
-        return *((T *)(address_of(x, y)));
-    }
-
-    /** Assuming this image is three-dimensional, get the value of the
-     * element at position (x, y, z) */
-    const T &operator()(int x, int y, int z) const {
-        return *((T *)(address_of(x, y, z)));
-    }
-
-    /** Assuming this image is four-dimensional, get the value of the
-     * element at position (x, y, z, w) */
-    const T &operator()(int x, int y, int z, int w) const {
-        return *((T *)(address_of(x, y, z, w)));
-    }
-
-    /** Assuming this image is one-dimensional, get a reference to the
-     * element at position x */
-    T &operator()(int x) {
-        return *((T *)(address_of(x)));
-    }
-
-    /** Assuming this image is two-dimensional, get a reference to the
-     * element at position (x, y) */
-    T &operator()(int x, int y) {
-        return *((T *)(address_of(x, y)));
-    }
-
-    /** Assuming this image is three-dimensional, get a reference to the
-     * element at position (x, y, z) */
-    T &operator()(int x, int y, int z) {
-        return *((T *)(address_of(x, y, z)));
-    }
-
-    /** Assuming this image is four-dimensional, get a reference to the
-     * element at position (x, y, z, w) */
-    T &operator()(int x, int y, int z, int w) {
-        return *((T *)(address_of(x, y, z, w)));
-    }
-
-    /** Get a handle on the Buffer that this image holds */
-    operator Buffer() const {
-        return buffer;
-    }
-
-    /** Convert this image to an argument to a halide pipeline. */
-    operator Argument() const {
-        return Argument(buffer);
-    }
-
-    /** Convert this image to an argument to an extern stage. */
-    operator ExternFuncArgument() const {
-        return ExternFuncArgument(buffer);
-    }
-
-    /** Treating the image as an Expr is equivalent to call it with no
-     * arguments. For example, you can say:
-     *
-     \code
-     Image im(10, 10);
-     Func f;
-     f = im*2;
-     \endcode
-     *
-     * This will define f as a two-dimensional function with value at
-     * position (x, y) equal to twice the value of the image at the
-     * same location.
-     */
-    operator Expr() const {
-        return (*this)(_);
-    }
-
-
-};
-
-}
-
-#endif
diff --git a/src/ImageParam.h b/src/ImageParam.h
index 4c40d55c6c6e..832e1266c514 100644
--- a/src/ImageParam.h
+++ b/src/ImageParam.h
@@ -37,7 +37,13 @@ class ImageParam : public OutputImageParam {
     EXPORT ImageParam(Type t, int d, const std::string &n);
 
     /** Bind a buffer or image to this ImageParam. Only relevant for jitting */
+    // @{
     EXPORT void set(Buffer b);
+    template<typename T, int D>
+    NO_INLINE void set(const Image<T, D> &im) {
+        set(Buffer(im));
+    }
+    // @}
 
     /** Get the buffer bound to this ImageParam. Only relevant for jitting */
     EXPORT Buffer get() const;
@@ -49,7 +55,7 @@ class ImageParam : public OutputImageParam {
      */
     // @{
     template <typename... Args>
-    Expr operator()(Args&&... args) const {
+    NO_INLINE Expr operator()(Args&&... args) const {
         return func(std::forward<Args>(args)...);
     }
     EXPORT Expr operator()(std::vector<Expr>) const;
diff --git a/src/Introspection.cpp b/src/Introspection.cpp
index 63e15bbcfdaa..8ecca69d23a3 100644
--- a/src/Introspection.cpp
+++ b/src/Introspection.cpp
@@ -712,7 +712,7 @@ class DebugSections {
 
     // Look up n stack frames and get the source location as filename:line
     std::string get_source_location() {
-
+        return "";
         debug(5) << "Finding source location\n";
 
         if (!source_lines.size()) {
diff --git a/src/Module.cpp b/src/Module.cpp
index 01ca74c5727a..871ed39cb2d8 100644
--- a/src/Module.cpp
+++ b/src/Module.cpp
@@ -23,16 +23,16 @@ namespace {
 class TemporaryObjectFileDir final {
 public:
     TemporaryObjectFileDir() : dir_path(dir_make_temp()) {}
-    ~TemporaryObjectFileDir() { 
+    ~TemporaryObjectFileDir() {
         for (const auto &f : dir_files) {
             debug(1) << "file_unlink: " << f << "\n";
             file_unlink(f);
         }
         debug(1) << "dir_rmdir: " << dir_path << "\n";
-        dir_rmdir(dir_path); 
+        dir_rmdir(dir_path);
     }
-    std::string add_temp_object_file(const std::string &base_path_name, 
-                                     const std::string &suffix, 
+    std::string add_temp_object_file(const std::string &base_path_name,
+                                     const std::string &suffix,
                                      const Target &target,
                                      bool in_front = false) {
         const char* ext = target.os == Target::Windows && !target.has_feature(Target::MinGW) ? ".obj" : ".o";
@@ -254,7 +254,7 @@ void Module::compile(const Outputs &output_files) const {
 Outputs compile_standalone_runtime(const Outputs &output_files, Target t) {
     Module empty("standalone_runtime", t.without_feature(Target::NoRuntime).without_feature(Target::JIT));
     // For runtime, it only makes sense to output object files or static_library, so ignore
-    // everything else. 
+    // everything else.
     Outputs actual_outputs = Outputs().object(output_files.object_name).static_library(output_files.static_library_name);
     empty.compile(actual_outputs);
     return actual_outputs;
@@ -264,9 +264,9 @@ void compile_standalone_runtime(const std::string &object_filename, Target t) {
     compile_standalone_runtime(Outputs().object(object_filename), t);
 }
 
-void compile_multitarget(const std::string &fn_name, 
+void compile_multitarget(const std::string &fn_name,
                          const Outputs &output_files,
-                         const std::vector<Target> &targets, 
+                         const std::vector<Target> &targets,
                          ModuleProducer module_producer) {
     user_assert(!fn_name.empty()) << "Function name must be specified.\n";
     user_assert(!targets.empty()) << "Must specify at least one target.\n";
@@ -356,7 +356,7 @@ void compile_multitarget(const std::string &fn_name,
     // and add that to the result.
     if (!base_target.has_feature(Target::NoRuntime)) {
         const Target runtime_target = base_target.without_feature(Target::NoRuntime);
-        compile_standalone_runtime(Outputs().object(temp_dir.add_temp_object_file(output_files.static_library_name, "_runtime", runtime_target)), 
+        compile_standalone_runtime(Outputs().object(temp_dir.add_temp_object_file(output_files.static_library_name, "_runtime", runtime_target)),
             runtime_target);
     }
 
@@ -386,7 +386,7 @@ void compile_multitarget(const std::string &fn_name,
     // may get optimized away at link time.
     wrapper_module.compile(Outputs().object(temp_dir.add_temp_object_file(output_files.static_library_name, "_wrapper", base_target, /* in_front*/ true)));
 
-    if (!output_files.c_header_name.empty()) { 
+    if (!output_files.c_header_name.empty()) {
         debug(1) << "compile_multitarget: c_header_name " << output_files.c_header_name << "\n";
         wrapper_module.compile(Outputs().c_header(output_files.c_header_name));
     }
diff --git a/src/Module.h b/src/Module.h
index 144065ca33e4..989f36ddaa8b 100644
--- a/src/Module.h
+++ b/src/Module.h
@@ -7,7 +7,7 @@
  */
 
 #include <functional>
- 
+
 #include "IR.h"
 #include "Buffer.h"
 #include "ModulusRemainder.h"
@@ -110,9 +110,9 @@ EXPORT Outputs compile_standalone_runtime(const Outputs &output_files, Target t)
 
 typedef std::function<Module(const std::string &, const Target &)> ModuleProducer;
 
-EXPORT void compile_multitarget(const std::string &fn_name, 
+EXPORT void compile_multitarget(const std::string &fn_name,
                                 const Outputs &output_files,
-                                const std::vector<Target> &targets, 
+                                const std::vector<Target> &targets,
                                 ModuleProducer module_producer);
 
 }
diff --git a/src/OutputImageParam.cpp b/src/OutputImageParam.cpp
index d30944dd5269..a093ac71afee 100644
--- a/src/OutputImageParam.cpp
+++ b/src/OutputImageParam.cpp
@@ -1,5 +1,5 @@
 #include "OutputImageParam.h"
-
+#include "IROperator.h"
 
 namespace Halide {
 
@@ -19,88 +19,119 @@ bool OutputImageParam::defined() const {
     return param.defined();
 }
 
-Expr OutputImageParam::min(int x) const {
+OutputImageParam::Dimension OutputImageParam::dim(int i) {
+    user_assert(defined())
+        << "Can't access the dimensions of an undefined ImageParam\n";
+    user_assert(i >= 0 && i < dimensions())
+        << "Can't access dimension " << i
+        << " of a " << dimensions() << "-dimensional ImageParam\n";
+    return OutputImageParam::Dimension(param, i);
+}
+
+const OutputImageParam::Dimension OutputImageParam::dim(int i) const {
+    user_assert(defined())
+        << "Can't access the dimensions of an undefined ImageParam\n";
+    user_assert(i >= 0 && i < dimensions())
+        << "Can't access dimension " << i
+        << " of a " << dimensions() << "-dimensional ImageParam\n";
+    return OutputImageParam::Dimension(param, i);
+}
+
+Expr OutputImageParam::Dimension::min() const {
     std::ostringstream s;
-    s << name() << ".min." << x;
+    s << param.name() << ".min." << d;
     return Internal::Variable::make(Int(32), s.str(), param);
 }
 
-Expr OutputImageParam::extent(int x) const {
+Expr OutputImageParam::Dimension::extent() const {
     std::ostringstream s;
-    s << name() << ".extent." << x;
+    s << param.name() << ".extent." << d;
     return Internal::Variable::make(Int(32), s.str(), param);
 }
 
-Expr OutputImageParam::stride(int x) const {
+Expr OutputImageParam::Dimension::max() const {
+    return min() + extent() - 1;
+}
+
+Expr OutputImageParam::Dimension::stride() const {
     std::ostringstream s;
-    s << name() << ".stride." << x;
+    s << param.name() << ".stride." << d;
     return Internal::Variable::make(Int(32), s.str(), param);
 }
 int OutputImageParam::host_alignment() const {
     return param.host_alignment();
 }
 
-OutputImageParam &OutputImageParam::set_extent(int dim, Expr extent) {
-    param.set_extent_constraint(dim, extent);
+OutputImageParam::Dimension OutputImageParam::Dimension::set_extent(Expr extent) {
+    param.set_extent_constraint(d, extent);
     return *this;
 }
 
-OutputImageParam &OutputImageParam::set_min(int dim, Expr min) {
-    param.set_min_constraint(dim, min);
+OutputImageParam::Dimension OutputImageParam::Dimension::set_min(Expr min) {
+    param.set_min_constraint(d, min);
     return *this;
 }
 
-OutputImageParam &OutputImageParam::set_stride(int dim, Expr stride) {
-    param.set_stride_constraint(dim, stride);
+OutputImageParam::Dimension OutputImageParam::Dimension::set_stride(Expr stride) {
+    param.set_stride_constraint(d, stride);
     return *this;
 }
 
+
+OutputImageParam::Dimension OutputImageParam::Dimension::set_bounds(Expr min, Expr extent) {
+    return set_min(min).set_extent(extent);
+}
+
+OutputImageParam::Dimension OutputImageParam::Dimension::dim(int i) {
+    return OutputImageParam::Dimension(param, i);
+}
+
+const OutputImageParam::Dimension OutputImageParam::Dimension::dim(int i) const {
+    return OutputImageParam::Dimension(param, i);
+}
+
 OutputImageParam &OutputImageParam::set_host_alignment(int bytes) {
     param.set_host_alignment(bytes);
     return *this;
 }
 
-OutputImageParam &OutputImageParam::set_bounds(int dim, Expr min, Expr extent) {
-    return set_min(dim, min).set_extent(dim, extent);
-}
-
 int OutputImageParam::dimensions() const {
     return param.dimensions();
 }
 
 Expr OutputImageParam::left() const {
     user_assert(dimensions() > 0) << "Can't ask for the left of a zero-dimensional image\n";
-    return min(0);
+    return dim(0).min();
 }
 
 Expr OutputImageParam::right() const {
     user_assert(dimensions() > 0) << "Can't ask for the right of a zero-dimensional image\n";
-    return Internal::Add::make(min(0), Internal::Sub::make(extent(0), 1));
+    return dim(0).max();
 }
 
 Expr OutputImageParam::top() const {
     user_assert(dimensions() > 1) << "Can't ask for the top of a zero- or one-dimensional image\n";
-    return min(1);
+    return dim(1).min();
 }
 
 Expr OutputImageParam::bottom() const {
     user_assert(dimensions() > 1) << "Can't ask for the bottom of a zero- or one-dimensional image\n";
-    return Internal::Add::make(min(1), Internal::Sub::make(extent(1), 1));
+    return dim(1).max();
 }
 
 Expr OutputImageParam::width() const {
     user_assert(dimensions() > 0) << "Can't ask for the width of a zero-dimensional image\n";
-    return extent(0);
+    return dim(0).extent();
 }
 
 Expr OutputImageParam::height() const {
     user_assert(dimensions() > 1) << "Can't ask for the height of a zero or one-dimensional image\n";
-    return extent(1);
+    return dim(1).extent();
 }
 
 Expr OutputImageParam::channels() const {
     user_assert(dimensions() > 2) << "Can't ask for the channels of an image with fewer than three dimensions\n";
-    return extent(2);
+    return dim(2).extent();
 }
 
 Internal::Parameter OutputImageParam::parameter() const {
diff --git a/src/OutputImageParam.h b/src/OutputImageParam.h
index 7956f278ac4d..960c444582c2 100644
--- a/src/OutputImageParam.h
+++ b/src/OutputImageParam.h
@@ -20,9 +20,89 @@ class OutputImageParam {
     /** Is this an input or an output? OutputImageParam is the base class for both. */
     Argument::Kind kind;
 
+    void add_implicit_args_if_placeholder(std::vector<Expr> &args,
+                                          Expr last_arg,
+                                          int total_args,
+                                          bool *placeholder_seen) const;
 public:
 
-    /** Construct a nullptr image parameter handle. */
+    struct Dimension {
+        /** Get an expression representing the minimum coordinates of this image
+         * parameter in the given dimension. */
+        EXPORT Expr min() const;
+
+        /** Get an expression representing the extent of this image
+         * parameter in the given dimension */
+        EXPORT Expr extent() const;
+
+        /** Get an expression representing the maximum coordinates of
+         * this image parameter in the given dimension. */
+        EXPORT Expr max() const;
+
+        /** Get an expression representing the stride of this image in the
+         * given dimension */
+        EXPORT Expr stride() const;
+
+        /** Set the min in a given dimension to equal the given
+         * expression. Setting the mins to zero may simplify some
+         * addressing math. */
+        EXPORT Dimension set_min(Expr e);
+
+        /** Set the extent in a given dimension to equal the given
+         * expression. Images passed in that fail this check will generate
+         * a runtime error. Returns a handle to this Dimension so that
+         * these calls may be chained.
+         *
+         * This may help the compiler generate better
+         * code. E.g:
+         \code
+         im.dim(0).set_extent(100);
+         \endcode
+         * tells the compiler that dimension zero must be of extent 100,
+         * which may result in simplification of boundary checks. The
+         * value can be an arbitrary expression:
+         \code
+         im.dim(0).set_extent(im.dim(1).extent());
+         \endcode
+         * declares that im is a square image (of unknown size), whereas:
+         \code
+         im.dim(0).set_extent((im.dim(0).extent()/32)*32);
+         \endcode
+         * tells the compiler that the extent is a multiple of 32. */
+        EXPORT Dimension set_extent(Expr e);
+
+        /** Set the stride in a given dimension to equal the given
+         * value. This is particularly helpful to set when
+         * vectorizing. Known strides for the vectorized dimension
+         * generate better code. */
+        EXPORT Dimension set_stride(Expr e);
+
+        /** Set the min and extent in one call. */
+        EXPORT Dimension set_bounds(Expr min, Expr extent);
+
+        /** Get a different dimension of the same buffer */
+        // @{
+        EXPORT Dimension dim(int i);
+        EXPORT const Dimension dim(int i) const;
+        // @}
+
+    private:
+        friend class OutputImageParam;
+
+        /** Construct a Dimension representing dimension d of some
+         * Internal::Parameter p. Only OutputImageParam may construct
+         * these. */
+        Dimension(const Internal::Parameter &p, int d) : param(p), d(d) {}
+
+        /** Only OutputImageParam may copy these, too. This prevents
+         * users removing constness by making a non-const copy. */
+        Dimension(const Dimension &) = default;
+
+        Internal::Parameter param;
+        int d;
+    };
+
+    /** Construct a null image parameter handle. */
     OutputImageParam() {}
 
     /** Construct an OutputImageParam that wraps an Internal Parameter object. */
@@ -37,66 +117,32 @@ class OutputImageParam {
     /** Is this parameter handle non-nullptr */
     EXPORT bool defined() const;
 
-    /** Get an expression representing the minimum coordinates of this image
-     * parameter in the given dimension. */
-    EXPORT Expr min(int x) const;
-
-    /** Get an expression representing the extent of this image
-     * parameter in the given dimension */
-    EXPORT Expr extent(int x) const;
-
-    /** Get an expression representing the stride of this image in the
-     * given dimension */
-    EXPORT Expr stride(int x) const;
-
-    /** Get the ailgnment of the host pointer. Use set_host_alignment
-     * to change the default value of 1. */
+    /** Get a handle on one of the dimensions for the purposes of
+     * inspecting or constraining its min, extent, or stride. */
+    EXPORT Dimension dim(int i);
+
+    /** Get a handle on one of the dimensions for the purposes of
+     * inspecting its min, extent, or stride. */
+    EXPORT const Dimension dim(int i) const;
+
+    /** Legacy index-based shape accessors that forward to dim(i).
+     * Soon to be deprecated. Do not use in new code. */
+    // @{
+    OutputImageParam set_min(int i, Expr e) {dim(i).set_min(e); return *this;}
+    OutputImageParam set_extent(int i, Expr e) {dim(i).set_extent(e); return *this;}
+    OutputImageParam set_bounds(int i, Expr a, Expr b) {dim(i).set_bounds(a, b); return *this;}
+    OutputImageParam set_stride(int i, Expr e) {dim(i).set_stride(e); return *this;}
+    Expr min(int i) const {return dim(i).min();}
+    Expr extent(int i) const {return dim(i).extent();}
+    Expr stride(int i) const {return dim(i).stride();}
+    // @}
+
+    /** Get the alignment of the host pointer in bytes. Defaults to
+     * the size of type. */
     EXPORT int host_alignment() const;
 
-    /** Set the extent in a given dimension to equal the given
-     * expression. Images passed in that fail this check will generate
-     * a runtime error. Returns a reference to the ImageParam so that
-     * these calls may be chained.
-     *
-     * This may help the compiler generate better
-     * code. E.g:
-     \code
-     im.set_extent(0, 100);
-     \endcode
-     * tells the compiler that dimension zero must be of extent 100,
-     * which may result in simplification of boundary checks. The
-     * value can be an arbitrary expression:
-     \code
-     im.set_extent(0, im.extent(1));
-     \endcode
-     * declares that im is a square image (of unknown size), whereas:
-     \code
-     im.set_extent(0, (im.extent(0)/32)*32);
-     \endcode
-     * tells the compiler that the extent is a multiple of 32. */
-    EXPORT OutputImageParam &set_extent(int dim, Expr extent);
-
-    /** Set the min in a given dimension to equal the given
-     * expression. Setting the mins to zero may simplify some
-     * addressing math. */
-    EXPORT OutputImageParam &set_min(int dim, Expr min);
-
-    /** Set the stride in a given dimension to equal the given
-     * value. This is particularly helpful to set when
-     * vectorizing. Known strides for the vectorized dimension
-     * generate better code. */
-    EXPORT OutputImageParam &set_stride(int dim, Expr stride);
-
-    /** Set the alignment of the host pointer. On some architectures
-     * an unaligned load/store is significantly more expensive in
-     * terms of performance than an aligned load/store. This allows
-     * the user to align external buffers favorably so that halide
-     * can generate aligned loads/stores as appropriate. The alignment
-     * should be a power of 2. */
-    EXPORT OutputImageParam &set_host_alignment(int bytes);
-
-    /** Set the min and extent in one call. */
-    EXPORT OutputImageParam &set_bounds(int dim, Expr min, Expr extent);
+    /** Set the expected alignment of the host pointer in bytes. */
+    EXPORT OutputImageParam &set_host_alignment(int);
 
     /** Get the dimensionality of this image parameter */
     EXPORT int dimensions() const;
diff --git a/src/Pipeline.cpp b/src/Pipeline.cpp
index bbc05fa89511..07f866542415 100644
--- a/src/Pipeline.cpp
+++ b/src/Pipeline.cpp
@@ -261,7 +261,7 @@ void Pipeline::compile_to_static_library(const string &filename_prefix,
     m.compile(outputs);
 }
 
-void Pipeline::compile_to_multitarget_static_library(const std::string &filename_prefix, 
+void Pipeline::compile_to_multitarget_static_library(const std::string &filename_prefix,
                                                      const std::vector<Argument> &args,
                                                      const std::vector<Target> &targets) {
     auto module_producer = [this, &args](const std::string &name, const Target &target) -> Module {
@@ -940,16 +940,14 @@ vector<const void *> Pipeline::prepare_jit_call_arguments(Realization dst, const
         Type type = output_buffer_types[i].type;
         user_assert(dst[i].dimensions() == dims)
             << "Can't realize Func \"" << func.name()
-            << "\" into Buffer \"" << dst[i].name()
-            << "\" because Buffer \"" << dst[i].name()
-            << "\" is " << dst[i].dimensions() << "-dimensional"
-            << ", but Func \"" << func.name()
+            << "\" into Buffer at " << (void *)dst[i].host_ptr()
+            << " because Buffer is " << dst[i].dimensions()
+            << "-dimensional, but Func \"" << func.name()
             << "\" is " << dims << "-dimensional.\n";
         user_assert(dst[i].type() == type)
             << "Can't realize Func \"" << func.name()
-            << "\" into Buffer \"" << dst[i].name()
-            << "\" because Buffer \"" << dst[i].name()
-            << "\" has type " << dst[i].type()
+            << "\" into Buffer at " << (void *)dst[i].host_ptr()
+            << " because Buffer has type " << Type(dst[i].type())
             << ", but Func \"" << func.name()
             << "\" has type " << type << ".\n";
     }
@@ -983,12 +981,10 @@ vector<const void *> Pipeline::prepare_jit_call_arguments(Realization dst, const
     }
 
     // Then the outputs
-    for (Buffer buf : dst.as_vector()) {
-        internal_assert(buf.defined()) << "Can't realize into an undefined Buffer\n";
+    for (const Buffer &buf : dst.as_vector()) {
         arg_values.push_back(buf.raw_buffer());
         const void *ptr = arg_values.back();
-        debug(1) << "JIT output buffer " << buf.name()
-                 << " @ " << ptr << "\n";
+        debug(1) << "JIT output buffer @ " << ptr << "\n";
     }
 
     return arg_values;
@@ -1237,42 +1233,24 @@ void Pipeline::infer_input_bounds(Realization dst) {
                            << buf.min[2] + buf.extent[2] << ","
                            << buf.min[3] + buf.extent[3] << ")\n";
 
-        // Figure out how much memory to allocate for this buffer
-        size_t min_idx = 0, max_idx = 0;
-        for (int d = 0; d < 4; d++) {
-            if (buf.stride[d] > 0) {
-                min_idx += buf.min[d] * buf.stride[d];
-                max_idx += (buf.min[d] + buf.extent[d] - 1) * buf.stride[d];
-            } else {
-                max_idx += buf.min[d] * buf.stride[d];
-                min_idx += (buf.min[d] + buf.extent[d] - 1) * buf.stride[d];
-            }
-        }
-        size_t total_size = (max_idx - min_idx);
-        while (total_size & 0x1f) total_size++;
-
-        // Allocate enough memory with the right dimensionality.
-        Buffer buffer(ia.param.type(), total_size,
-                      buf.extent[1] > 0 ? 1 : 0,
-                      buf.extent[2] > 0 ? 1 : 0,
-                      buf.extent[3] > 0 ? 1 : 0);
-
-        // Rewrite the buffer fields to match the ones returned
-        for (int d = 0; d < 4; d++) {
-            buffer.raw_buffer()->min[d] = buf.min[d];
-            buffer.raw_buffer()->stride[d] = buf.stride[d];
-            buffer.raw_buffer()->extent[d] = buf.extent[d];
-        }
-        ia.param.set_buffer(buffer);
+        Image<void> im(ia.param.type(), buf);
+        im.allocate();
+        ia.param.set_buffer(im);
     }
 }
 
 void Pipeline::infer_input_bounds(int x_size, int y_size, int z_size, int w_size) {
     user_assert(defined()) << "Can't infer input bounds on an undefined Pipeline.\n";
 
+    vector<int> size;
+    if (x_size) size.push_back(x_size);
+    if (y_size) size.push_back(y_size);
+    if (z_size) size.push_back(z_size);
+    if (w_size) size.push_back(w_size);
+
     vector<Buffer> bufs;
     for (Type t : contents->outputs[0].output_types()) {
-        bufs.push_back(Buffer(t, x_size, y_size, z_size, w_size));
+        bufs.push_back(Buffer(t, size));
     }
     Realization r(bufs);
     infer_input_bounds(r);
diff --git a/src/Pipeline.h b/src/Pipeline.h
index 7be4b1879b73..89b25254b5ae 100644
--- a/src/Pipeline.h
+++ b/src/Pipeline.h
@@ -11,7 +11,6 @@
 
 #include "Buffer.h"
 #include "IntrusivePtr.h"
-#include "Image.h"
 #include "JITModule.h"
 #include "Module.h"
 #include "Tuple.h"
@@ -363,8 +362,8 @@ class Pipeline {
     EXPORT void realize(Realization dst, const Target &target = Target());
     EXPORT void realize(Buffer dst, const Target &target = Target());
 
-    template<typename T>
-    NO_INLINE void realize(Image<T> dst, const Target &target = Target()) {
+    template<typename T, int D>
+    NO_INLINE void realize(Image<T, D> dst, const Target &target = Target()) {
         // Images are expected to exist on-host.
         realize(Buffer(dst), target);
         dst.copy_to_host();
@@ -380,6 +379,17 @@ class Pipeline {
     EXPORT void infer_input_bounds(int x_size = 0, int y_size = 0, int z_size = 0, int w_size = 0);
     EXPORT void infer_input_bounds(Realization dst);
     EXPORT void infer_input_bounds(Buffer dst);
+
+    template<typename T, int D>
+    NO_INLINE void infer_input_bounds(Image<T, D> &im) {
+        // It's possible for bounds inference to also manipulate
+        // output buffers if their host pointer is null, so we must
+        // take Images by reference and communicate the bounds query
+        // result by modifying the argument.
+        Buffer b(im);
+        infer_input_bounds(b);
+        im = b.get();
+    }
     // @}
 
     /** Infer the arguments to the Pipeline, sorted into a canonical order:
diff --git a/src/RDom.cpp b/src/RDom.cpp
index 408987ad82b1..f2c1417bb2fc 100644
--- a/src/RDom.cpp
+++ b/src/RDom.cpp
@@ -147,8 +147,8 @@ RDom::RDom(Buffer b) {
     for (int i = 0; i < b.dimensions(); i++) {
         ReductionVariable var = {
             b.name() + "$" + var_names[i],
-            b.min(i),
-            b.extent(i)
+            b.dim(i).min(),
+            b.dim(i).extent()
         };
         vars.push_back(var);
     }
diff --git a/src/RDom.h b/src/RDom.h
index 08b7785a0974..dd2a4fa3b6f6 100644
--- a/src/RDom.h
+++ b/src/RDom.h
@@ -215,6 +215,8 @@ class RDom {
     // @{
     EXPORT RDom(Buffer);
     EXPORT RDom(ImageParam);
+    template<typename T, int D>
+    NO_INLINE RDom(const Image<T, D> &im) : RDom(Buffer(im)) {}
     // @}
 
     /** Construct a reduction domain that wraps an Internal ReductionDomain object. */
diff --git a/src/Tuple.h b/src/Tuple.h
index 7af0adce062f..8749b4e9be27 100644
--- a/src/Tuple.h
+++ b/src/Tuple.h
@@ -89,9 +89,16 @@ class Realization {
         return buffers[0];
     }
 
+    /** Single-element realizations are implicitly castable to Images. */
+    template<typename T, int D>
+    operator Image<T, D>() const {
+        return buffers[0];
+    }
+
     /** Construct a Realization from some Buffers. */
     //@{
-    template<typename ...Args>
+    template<typename ...Args,
+             typename = typename std::enable_if<Internal::all_are_convertible<Buffer, Args...>::value>::type>
     Realization(Buffer a, Buffer b, Args&&... args) {
         buffers = std::vector<Buffer>{a, b, std::forward<Args>(args)...};
     }
diff --git a/tools/halide_image.h b/src/runtime/HalideImage.h
similarity index 66%
rename from tools/halide_image.h
rename to src/runtime/HalideImage.h
index 11ca5190a891..35472709dc51 100644
--- a/tools/halide_image.h
+++ b/src/runtime/HalideImage.h
@@ -9,6 +9,7 @@
 #include <memory>
 #include <vector>
 #include <cassert>
+#include <atomic>
 #include <stdint.h>
 #include <string.h>
 
@@ -21,9 +22,7 @@ struct halide_dimension_t {
     int min, extent, stride;
 };
 
-
 namespace Halide {
-namespace Tools {
 
 template<typename Fn>
 void for_each_element(const buffer_t &buf, Fn &&f);
@@ -31,9 +30,12 @@ void for_each_element(const buffer_t &buf, Fn &&f);
 // Forward-declare our Image class
 template<typename T, int D> class Image;
 
-// This template exists so that Image is extensible with custom
-// operator()(Args...) methods.
-template<typename T, int D, typename ...Args> struct ImageAccessor;
+// This declaration exists so that Image is extensible with custom
+// operator()(Args...) methods. Add implementations of it for whatever
+// types you like. Use enable_if if necessary to stop the overloads
+// being ambiguous.
+template<typename Ret, typename T, int D, typename ...Args>
+Ret image_accessor(const Image<T, D> &, Args...);
 
 // A helper to check if a parameter pack is entirely implicitly
 // int-convertible to use with std::enable_if
@@ -65,16 +67,50 @@ struct AllInts<double, Args...> {
     static const bool value = false;
 };
 
-/** A class that wraps buffer_t and adds functionality. Acts as a base
- * class for the typed version below. Templated on the maximum
- * dimensionality it supports. Use it only when the the element type
- * is unknown, or generic. See the comments on the Image class below
- * for more details. */
-template<int D = 4>
-class Buffer {
-    static_assert(D <= 4, "buffer_t supports a maximum of four dimensions");
+/** A struct acting as a header for allocations owned by the Image
+ * class itself. */
+struct AllocationHeader {
+    void (*deallocate_fn)(void *);
+    std::atomic<int> ref_count;
+};
+
+/** A templated Image class that wraps buffer_t and adds
+ * functionality. When using Halide from C++, this is the preferred
+ * way to create input and output buffers. The overhead of using this
+ * class relative to a naked buffer_t is minimal - it uses another
+ * ~100 bytes on the stack, and does no dynamic allocations when using
+ * it to represent existing memory. This overhead will shrink further
+ * in the future once buffer_t is deprecated.
+ *
+ * The template parameter T is the element type, and D is the maximum
+ * number of dimensions. It must be less than or equal to 4 for now.
+ *
+ * The class optionally allocates and owns memory for the image using
+ * a reference-counted allocation made with the provided allocator. If
+ * none is provided, malloc and free are used.  Any device-side
+ * allocation is considered as owned if and only if the host-side
+ * allocation is owned.
+ *
+ * For accessing the shape and type, this class provides both the
+ * buffer_t interface (extent[i], min[i], and stride[i] arrays, the
+ * elem_size field), and also the interface of the yet-to-come
+ * halide_buffer_t, which will replace buffer_t. This is intended to
+ * allow a gradual transition to halide_buffer_t. New code should
+ * access the shape via dim[i].extent, dim[i].min, dim[i].stride, and
+ * the type via the 'type' field. */
+template<typename T, int D = 4>
+class Image {
+    // Some helpers for checking properties of T
+    static const bool T_is_void = std::is_same<T, void>::value;
+    using not_void_T = typename std::conditional<T_is_void, uint8_t, T>::type;
 
-protected:
+    // Get the Halide type of T. Callers should not use the result if
+    // T is void.
+    static halide_type_t static_halide_type() {
+        return halide_type_of<not_void_T>();
+    }
+
+    static_assert(D <= 4, "buffer_t supports a maximum of four dimensions");
 
     buffer_t buf = {0};
 
@@ -88,7 +124,30 @@ class Buffer {
 
     /** The allocation owned by this Image. NULL if the Image does not
      * own the memory. */
-    std::shared_ptr<uint8_t> alloc;
+    AllocationHeader *alloc = nullptr;
+
+    /** Increment the reference count of any allocation */
+    void incref() {
+        if (alloc) {
+            alloc->ref_count++;
+        }
+    }
+
+    /** Decrement the reference count of any allocation and free host
+     * and device memory if it hits zero. Sets alloc to nullptr. */
+    void decref() {
+        if (alloc) {
+            int result = --(alloc->ref_count);
+            if (result == 0) {
+                if (buf.dev) {
+                    device_free();
+                }
+                void (*fn)(void *) = alloc->deallocate_fn;
+                fn(alloc);
+            }
+            alloc = nullptr;
+        }
+    }
 
     /** A temporary helper function to get the number of dimensions in
      * a buffer_t. Will disappear when halide_buffer_t is merged. */
@@ -125,6 +184,19 @@ class Buffer {
     void initialize_shape(int) {
     }
 
+    /** Initialize the shape from a vector of extents. Mins are zero
+     * and strides are dense, with dimension zero innermost. The vector
+     * must not hold more than the four dimensions buffer_t supports. */
+    void initialize_shape(const std::vector<int> &sizes) {
+        // Guard the fixed-size buffer_t shape arrays against overflow.
+        assert(sizes.size() <= 4 && "buffer_t supports a maximum of four dimensions");
+        for (size_t i = 0; i < sizes.size(); i++) {
+            buf.min[i] = 0;
+            buf.extent[i] = sizes[i];
+            buf.stride[i] = (i == 0) ? 1 : buf.stride[i-1] * buf.extent[i-1];
+        }
+    }
+
     /** Initialize the shape from the static shape of an array */
     template<typename Array, size_t N>
     void initialize_shape_from_array_shape(int next, Array (&vals)[N]) {
@@ -139,30 +211,30 @@ class Buffer {
     }
 
     /** Base case for the template recursion above. */
-    template<typename T>
-    void initialize_shape_from_array_shape(int, const T &) {
+    template<typename T2>
+    void initialize_shape_from_array_shape(int, const T2 &) {
     }
 
     /** Get the dimensionality of a multi-dimensional C array */
     template<typename Array, size_t N>
     static int dimensionality_of_array(Array (&vals)[N]) {
-        return Buffer<D>::dimensionality_of_array(vals[0]) + 1;
+        return dimensionality_of_array(vals[0]) + 1;
     }
 
-    template<typename T>
-    static int dimensionality_of_array(const T &) {
+    template<typename T2>
+    static int dimensionality_of_array(const T2 &) {
         return 0;
     }
 
     /** Get the underlying halide_type_t of an array's element type. */
     template<typename Array, size_t N>
     static halide_type_t scalar_type_of_array(Array (&vals)[N]) {
-        return Buffer<D>::scalar_type_of_array(vals[0]);
+        return scalar_type_of_array(vals[0]);
     }
 
-    template<typename T>
-    static halide_type_t scalar_type_of_array(const T &) {
-        return halide_type_of<typename std::remove_cv<T>::type>();
+    template<typename T2>
+    static halide_type_t scalar_type_of_array(const T2 &) {
+        return halide_type_of<typename std::remove_cv<T2>::type>();
     }
 
     /** Check if any args in a parameter pack are zero */
@@ -176,8 +248,17 @@ class Buffer {
         return false;
     }
 
+    static bool any_zero(const std::vector<int> &v) {
+        for (int i : v) {
+            if (i == 0) return true;
+        }
+        return false;
+    }
+
 public:
 
+    typedef T ElemType;
+
     /** Read-only access to the shape */
     class Dimension {
         const buffer_t &buf;
@@ -218,6 +299,13 @@ class Buffer {
         return Dimension(buf, i);
     }
 
+    /** Access to the mins, strides, extents. Will be deprecated. Do not use. */
+    // @{
+    int min(int i) const { return dim(i).min(); }
+    int extent(int i) const { return dim(i).extent(); }
+    int stride(int i) const { return dim(i).stride(); }
+    // @}
+
     /** The total number of elements this buffer represents. Equal to
      * the product of the extents */
     size_t number_of_elements() const {
@@ -240,18 +328,18 @@ class Buffer {
 
     /** A pointer to the element with the lowest address. If all
      * strides are positive, equal to the host pointer. */
-    uint8_t *begin() const {
+    T *begin() const {
         ptrdiff_t index = 0;
         for (int i = 0; i < dimensions(); i++) {
             if (dim(i).stride() < 0) {
                 index += dim(i).stride() * (dim(i).extent() - 1);
             }
         }
-        return buf.host + index * buf.elem_size;
+        return (T *)(buf.host + index * buf.elem_size);
     }
 
     /** A pointer to one beyond the element with the highest address. */
-    uint8_t *end() const {
+    T *end() const {
         ptrdiff_t index = 0;
         for (int i = 0; i < dimensions(); i++) {
             if (dim(i).stride() > 0) {
@@ -259,87 +347,167 @@ class Buffer {
             }
         }
         index += 1;
-        return buf.host + index * buf.elem_size;
+        return (T *)(buf.host + index * buf.elem_size);
     }
 
     /** The total number of bytes spanned by the data in memory. */
     size_t size_in_bytes() const {
-        return (size_t)(end() - begin());
+        return (size_t)((uint8_t *)end() - (uint8_t *)begin());
     }
 
-    Buffer() {}
+    Image() {}
 
     /** Make a buffer from a buffer_t */
-    Buffer(const buffer_t &buf) {
+    Image(const buffer_t &buf) : ty(static_halide_type()) {
+        static_assert(!T_is_void, "Can't construct an Image<void> from a buffer_t. Type is unknown.");
         initialize_from_buffer(buf);
     }
 
-    /** Give Buffers access to the members of Buffers of different dimensionalities. */
-    template<int D2> friend class Buffer;
+    Image(halide_type_t t, const buffer_t &buf) : ty(t) {
+        initialize_from_buffer(buf);
+    }
 
-    /** Make a Buffer from another Buffer of possibly-different
-     * dimensionality. Asserts if D is less than the dimensionality of
-     * the argument. */
-    template<int D2>
-    Buffer(const Buffer<D2> &other) : buf(other.buf),
-                                      dims(other.dims),
-                                      ty(other.ty),
-                                      alloc(other.alloc) {
+    /** Give Images access to the members of Images of different dimensionalities and types. */
+    template<typename T2, int D2> friend class Image;
+
+    /** Fail an assertion at runtime or compile-time if an Image<T, D>
+     * cannot be constructed from some other Image type. */
+    template<typename T2, int D2>
+    void assert_can_convert_from(const Image<T2, D2> &other) {
+        static_assert((std::is_same<typename std::remove_const<T>::type, T2>::value ||
+                       T_is_void ||
+                       std::is_same<T2, void>::value),
+                      "type mismatch constructing Image");
         if (D < D2) {
             assert(other.dimensions() <= D);
         }
+        if (std::is_same<T2, void>::value && !T_is_void) {
+            assert(other.ty == static_halide_type());
+        }
+    }
+
+    /** Make an Image<T, D> from another Image of possibly-different
+     * dimensionality and type. Asserts if D is less than the
+     * dimensionality of the argument, or if there's a type
+     * mismatch. */
+    template<typename T2, int D2>
+    Image(const Image<T2, D2> &other) : buf(other.buf),
+                                        dims(other.dims),
+                                        ty(other.ty),
+                                        alloc(other.alloc) {
+        incref();
+        assert_can_convert_from(other);
+    }
+
+    Image(const Image<T, D> &other) : buf(other.buf),
+                                      dims(other.dims),
+                                      ty(other.ty),
+                                      alloc(other.alloc) {
+        incref();
     }
 
-    /** Move-construct a Buffer from another Buffer of
+    /** Move-construct an Image from another Image of
      * possibly-different dimensionality. Asserts if D is less than
-     * the dimensionality of the argument. */
-    template<int D2>
-    Buffer(const Buffer<D2> &&other) : buf(other.buf),
-                                       dims(other.dims),
-                                       ty(other.ty),
-                                       alloc(std::move(other.alloc)) {
-        if (D < D2) {
-            assert(other.dimensions() <= D);
-        }
+     * the dimensionality of the argument, or if there's a type
+     * mismatch. */
+    template<typename T2, int D2>
+    Image(Image<T2, D2> &&other) : buf(other.buf),
+                                   dims(other.dims),
+                                   ty(other.ty),
+                                   alloc(other.alloc) {
+        other.alloc = nullptr;
+        assert_can_convert_from(other);
     }
 
+    Image(Image<T, D> &&other) : buf(other.buf),
+                                   dims(other.dims),
+                                   ty(other.ty),
+                                   alloc(other.alloc) {
+        other.alloc = nullptr;
+    }
 
-    /** Assign from another Buffer of possibly-different
-     * dimensionality. Asserts if D is less than the dimensionality of
-     * the argument. */
-    template<int D2>
-    Buffer<D> &operator=(const Buffer<D2> &other) {
-        if (D < D2) {
-            assert(other.dimensions() <= D);
+    /** Assign from another Image of possibly-different dimensionality
+     * and type. Asserts if D is less than the dimensionality of the
+     * argument, or if there's a type mismatch. */
+    template<typename T2, int D2>
+    Image<T, D> &operator=(const Image<T2, D2> &other) {
+        assert_can_convert_from(other);
+        buf = other.buf;
+        ty = other.ty;
+        dims = other.dims;
+        if (alloc != other.alloc) {
+            // Drop existing allocation
+            decref();
+            // Share other allocation
+            alloc = other.alloc;
+            incref();
         }
+        return *this;
+    }
+
+    Image<T, D> &operator=(const Image<T, D> &other) {
         buf = other.buf;
         ty = other.ty;
         dims = other.dims;
-        alloc = other.alloc;
+        if (alloc != other.alloc) {
+            // Drop existing allocation
+            decref();
+            // Share other allocation
+            alloc = other.alloc;
+            incref();
+        }
         return *this;
     }
 
-    /** Move from another Buffer of possibly-different
-     * dimensionality. Asserts if D is less than the dimensionality of
-     * the argument. */
-    template<int D2>
-    Buffer<D> &operator=(const Buffer<D2> &&other) {
-        if (D < D2) {
-            assert(other.dimensions() <= D);
+    /** Move from another Image of possibly-different dimensionality
+     * and type. Asserts if D is less than the dimensionality of the
+     * argument, or if there's a type mismatch. */
+    template<typename T2, int D2>
+    Image<T, D> &operator=(Image<T2, D2> &&other) {
+        assert_can_convert_from(other);
+        buf = other.buf;
+        ty = other.ty;
+        dims = other.dims;
+        if (alloc != other.alloc) {
+            // Drop existing allocation
+            decref();
+            // Steal other allocation
+            alloc = other.alloc;
+            other.alloc = nullptr;
         }
+        return *this;
+    }
+
+    Image<T, D> &operator=(Image<T, D> &&other) {
         buf = other.buf;
         ty = other.ty;
         dims = other.dims;
-        alloc = std::move(other.alloc);
+        if (alloc != other.alloc) {
+            // Drop existing allocation
+            decref();
+            // Steal other allocation
+            alloc = other.alloc;
+            other.alloc = nullptr;
+        }
         return *this;
     }
 
+    /** Assert that multiplying the extents together did not wrap around
+     * size_t. A zero extent (empty buffer) returns without checking. */
+    void check_overflow() {
+        size_t size = 1;
+        for (int i = 0; i < dimensions(); i++) {
+            if (dim(i).extent() == 0) return;  // else the division below is by zero
+            size *= dim(i).extent();
+        }
+        for (int i = 0; i < dimensions(); i++) size /= dim(i).extent();
+        assert(size == 1 && "Error: Overflow computing total size of buffer.");
+    }
+
     /** Allocate memory for this Image. Drops the reference to any
      * existing memory. */
     void allocate(void *(*allocate_fn)(size_t) = nullptr,
                   void (*deallocate_fn)(void *) = nullptr) {
-        assert(buf.dev == 0);
-
         if (!allocate_fn) {
             allocate_fn = malloc;
         }
@@ -347,35 +515,80 @@ class Buffer {
             deallocate_fn = free;
         }
 
+        // Drop any existing allocation
+        decref();
+
         // Conservatively align images to 128 bytes. This is enough
         // alignment for all the platforms we might use.
         size_t size = size_in_bytes();
         const size_t alignment = 128;
         size = (size + alignment - 1) & ~(alignment - 1);
-        uint8_t *ptr;
-        ptr = (uint8_t *)allocate_fn(size + alignment - 1);
-        alloc.reset(ptr, deallocate_fn);
-        buf.host = (uint8_t *)((uintptr_t)(ptr + alignment - 1) & ~(alignment - 1));
+        alloc = (AllocationHeader *)allocate_fn(size + sizeof(AllocationHeader) + alignment - 1);
+        alloc->deallocate_fn = deallocate_fn;
+        alloc->ref_count = 1;
+        uint8_t *unaligned_ptr = ((uint8_t *)alloc) + sizeof(AllocationHeader);
+        buf.host = (uint8_t *)((uintptr_t)(unaligned_ptr + alignment - 1) & ~(alignment - 1));
+    }
+
+    /** Allocate a new image of the given size with a runtime
+     * type. Only used when you do know what size you want but you
+     * don't know statically what type the elements are. Pass zeroes
+     * to make a buffer suitable for bounds query calls. */
+    template<typename ...Args>
+    Image(halide_type_t t, int first, Args&&... rest) : ty(t) {
+        if (!T_is_void) {
+            assert(static_halide_type() == t);
+        }
+        static_assert(sizeof...(rest) < D,
+                      "Too many arguments to constructor. Use Image<T, D>, "
+                      "where D is at least the desired number of dimensions");
+        initialize_shape(0, first, int(rest)...);
+        buf.elem_size = ty.bytes();
+        dims = 1 + (int)(sizeof...(rest));
+        if (!any_zero(first, int(rest)...)) {
+            check_overflow();
+            allocate();
+        }
     }
 
-   /** Allocate a new image of the given size. Pass zeroes to make a
+
+    /** Allocate a new image of the given size. Pass zeroes to make a
      * buffer suitable for bounds query calls. */
     template<typename ...Args>
-    Buffer(halide_type_t t, int first, Args&&... rest) : ty(t) {
+    Image(int first, Args&&... rest) : ty(static_halide_type()) {
+        static_assert(!T_is_void,
+                      "To construct an Image<void>, pass a halide_type_t as the first argument to the constructor");
         static_assert(sizeof...(rest) < D,
-                      "Too many arguments to constructor. Use Image<T, D>, where D is at least the desired number of dimensions");
+                      "Too many arguments to constructor. Use Image<T, D>, "
+                      "where D is at least the desired number of dimensions");
         initialize_shape(0, first, int(rest)...);
         buf.elem_size = ty.bytes();
         dims = 1 + (int)(sizeof...(rest));
         if (!any_zero(first, int(rest)...)) {
+            check_overflow();
+            allocate();
+        }
+    }
+
+    /** Allocate a new image of unknown type using a vector of ints as the size. */
+    Image(halide_type_t t, const std::vector<int> &sizes) : ty(t) {
+        if (!T_is_void) {
+            assert(static_halide_type() == t);
+        }
+        assert(sizes.size() <= D);
+        initialize_shape(sizes);
+        buf.elem_size = ty.bytes();
+        dims = (int)sizes.size();
+        if (!any_zero(sizes)) {
+            check_overflow();
             allocate();
         }
     }
 
-    /** Make a Buffer that refers to a statically sized array. Does not
+    /** Make an Image that refers to a statically sized array. Does not
      * take ownership of the data. */
     template<typename Array, size_t N>
-    explicit Buffer(Array (&vals)[N]) {
+    explicit Image(Array (&vals)[N]) {
         dims = dimensionality_of_array(vals);
         initialize_shape_from_array_shape(dims - 1, vals);
         ty = scalar_type_of_array(vals);
@@ -383,13 +596,32 @@ class Buffer {
         buf.host = (uint8_t *)vals;
     }
 
-    /** Initialize a Buffer from a pointer and some sizes. Assumes
+    /** Initialize an Image of runtime type from a pointer and some
+     * sizes. Assumes dense row-major packing and a min coordinate of
+     * zero. Does not take ownership of the data. */
+    template<typename ...Args>
+    explicit Image(halide_type_t t, void *data, int first, Args&&... rest) {
+        if (!T_is_void) {
+            assert(static_halide_type() == t);
+        }
+        static_assert(sizeof...(rest) < D,
+                      "Too many arguments to constructor. Use Image<T, D>, "
+                      "where D is at least the desired number of dimensions");
+        ty = t;
+        initialize_shape(0, first, int(rest)...);
+        buf.elem_size = ty.bytes();
+        dims = 1 + (int)(sizeof...(rest));
+        buf.host = (uint8_t *)data;
+    }
+
+    /** Initialize an Image from a pointer and some sizes. Assumes
      * dense row-major packing and a min coordinate of zero. Does not
      * take ownership of the data. */
-    template<typename T, typename ...Args>
-    explicit Buffer(T *data, int first, Args&&... rest) {
+    template<typename ...Args>
+    explicit Image(T *data, int first, Args&&... rest) {
         static_assert(sizeof...(rest) < D,
-                      "Too many arguments to constructor. Use Image<T, D>, where D is at least the desired number of dimensions");
+                      "Too many arguments to constructor. Use Image<T, D>, "
+                      "where D is at least the desired number of dimensions");
         ty = halide_type_of<typename std::remove_cv<T>::type>();
         initialize_shape(0, first, int(rest)...);
         buf.elem_size = sizeof(T);
@@ -400,8 +632,27 @@ class Buffer {
     /** Initialize an Image from a pointer to the min coordinate and
      * an array describing the shape.  Does not take ownership of the
      * data. */
-    template<typename T, int N, typename std::enable_if<N < D>::type>
-    explicit Buffer(T *data, halide_dimension_t shape[N]) {
+    template<int N, typename std::enable_if<N < D>::type>
+    explicit Image(halide_type_t t, void *data, halide_dimension_t shape[N]) {
+        if (!T_is_void) {
+            assert(static_halide_type() == t);
+        }
+        ty = t;
+        dims = N;
+        for (int i = 0; i < N; i++) {
+            buf.min[i]    = shape[i].min;
+            buf.extent[i] = shape[i].extent;
+            buf.stride[i] = shape[i].stride;
+        }
+        buf.elem_size = ty.bytes();
+        buf.host = (uint8_t *)data;
+    }
+
+    /** Initialize an Image from a pointer to the min coordinate and
+     * an array describing the shape.  Does not take ownership of the
+     * data. */
+    template<int N, typename std::enable_if<N < D>::type>
+    explicit Image(T *data, halide_dimension_t shape[N]) {
         ty = halide_type_of<typename std::remove_cv<T>::type>();
         dims = N;
         for (int i = 0; i < N; i++) {
@@ -413,8 +664,10 @@ class Buffer {
         buf.host = (uint8_t *)data;
     }
 
-    template<typename T>
-    explicit Buffer(T *data, halide_dimension_t shape[D]) {
+    explicit Image(halide_type_t t, void *data, halide_dimension_t shape[D]) {
+        if (!T_is_void) {
+            assert(static_halide_type() == t);
+        }
-        ty = halide_type_of<typename std::remove_cv<T>::type>();
+        ty = t;  // take the caller-supplied runtime type, as the other halide_type_t overloads do
         dims = 0;
         for (int i = 0; i < D; i++) {
@@ -428,18 +681,22 @@ class Buffer {
         buf.host = (uint8_t *)data;
     }
 
-    /** If you use the (x, y, c) indexing convention, then Halide
-     * Images are stored planar by default. This function constructs
-     * an interleaved RGB or RGBA image that can still be indexed
-     * using (x, y, c). Passing it to a generator requires that the
-     * generator has been compiled with support for interleaved (also
-     * known as packed or chunky) memory layouts. */
-    static Buffer<D> make_interleaved(halide_type_t t, int width, int height, int channels) {
-        static_assert(D >= 3, "Not enough dimensions to make an interleaved image");
-        Buffer<D> im(t, channels, width, height);
-        im.transpose(0, 1);
-        im.transpose(1, 2);
-        return im;
+    explicit Image(T *data, halide_dimension_t shape[D]) {
+        ty = halide_type_of<typename std::remove_cv<T>::type>();
+        dims = 0;
+        for (int i = 0; i < D; i++) {
+            if (!shape[i].extent) break;
+            dims++;
+            buf.min[i]    = shape[i].min;
+            buf.extent[i] = shape[i].extent;
+            buf.stride[i] = shape[i].stride;
+        }
+        buf.elem_size = sizeof(T);
+        buf.host = (uint8_t *)data;
+    }
+
+    ~Image() {
+        decref();
     }
 
     /** Get a pointer to the raw buffer_t this wraps. */
@@ -453,6 +710,16 @@ class Buffer {
     }
     // @}
 
+    /** Access to the untyped host pointer */
+    // @{
+    const void *host_ptr() const {
+        return buf.host;
+    }
+    void *host_ptr() {
+        return buf.host;
+    }
+    // @}
+
     /** Provide a cast operator to buffer_t *, so that instances can
      * be passed directly to Halide filters. */
     operator buffer_t *() {
@@ -476,9 +743,9 @@ class Buffer {
      * or slice followed by copy to make a copy of only a portion of
      * the image. The new image uses the same memory layout as the
      * original, with holes compacted away. */
-    Buffer<D> copy(void *(*allocate_fn)(size_t) = nullptr,
-                   void (*deallocate_fn)(void *) = nullptr) const {
-        Buffer<D> src = *this;
+    Image<T, D> copy(void *(*allocate_fn)(size_t) = nullptr,
+                     void (*deallocate_fn)(void *) = nullptr) const {
+        Image<T, D> src = *this;
 
         // Reorder the dimensions of src to have strides in increasing order
         int swaps[(D*(D+1))/2];
@@ -493,12 +760,12 @@ class Buffer {
         }
 
         // Make a copy of it using this dimension ordering
-        Buffer<D> dst = src;
+        Image<T, D> dst = src;
         dst.allocate(allocate_fn, deallocate_fn);
 
         // Concatenate dense inner dimensions into contiguous memcpy tasks
-        Buffer<D> src_slice = src;
-        Buffer<D> dst_slice = dst;
+        Image<T, D> src_slice = src;
+        Image<T, D> dst_slice = dst;
         int64_t slice_size = 1;
-        while (src_slice.dimensions && src_slice.dim(0).stride() == slice_size) {
+        while (src_slice.dimensions() && src_slice.dim(0).stride() == slice_size) {
             assert(dst_slice.dim(0).stride() == slice_size);
@@ -526,10 +793,10 @@ class Buffer {
      * the given dimension. Does not assert the crop region is within
      * the existing bounds. The cropped image drops any device
      * handle. */
-    Buffer<D> cropped(int d, int min, int extent) const {
+    Image<T, D> cropped(int d, int min, int extent) const {
         // Make a fresh copy of the underlying buffer (but not a fresh
         // copy of the allocation, if there is one).
-        Buffer<D> im = *this;
+        Image<T, D> im = *this;
         // Drop the reference to any device allocation. It won't be
         // valid for the cropped image.
         im.buf.dev = 0;
@@ -551,10 +818,10 @@ class Buffer {
     /** Make an image that refers to a sub-rectangle of this image along
      * the first N dimensions. Does not assert the crop region is within
      * the existing bounds. The cropped image drops any device handle. */
-    Buffer<D> cropped(const std::vector<std::pair<int, int>> &rect) const {
+    Image<T, D> cropped(const std::vector<std::pair<int, int>> &rect) const {
         // Make a fresh copy of the underlying buffer (but not a fresh
         // copy of the allocation, if there is one).
-        Buffer<D> im = *this;
+        Image<T, D> im = *this;
         // Drop the reference to any device allocation. It won't be
         // valid for the cropped image.
         im.buf.dev = 0;
@@ -573,8 +840,8 @@ class Buffer {
      * translated coordinates in the given dimension. Positive values
      * move the image data to the right or down relative to the
      * coordinate system. Drops any device handle. */
-    Buffer<D> translated(int d, int dx) const {
-        Buffer<D> im = *this;
+    Image<T, D> translated(int d, int dx) const {
+        Image<T, D> im = *this;
         im.buf.dev = 0;
         im.translate(d, dx);
         return im;
@@ -588,7 +855,7 @@ class Buffer {
     /** Make an image which refers to the same data translated along
      * the first N dimensions. */
-    void translated(const std::vector<int> &delta) {
-        Buffer<D> im = *this;
+    Image<T, D> translated(const std::vector<int> &delta) const {
+        Image<T, D> im = *this;
         im.buf.dev = 0;
         im.translate(delta);
         return im;
@@ -601,10 +868,21 @@ class Buffer {
         }
     }
 
+    /** Set the min coordinate of the first sizeof...(args) dimensions. Only
+     * buf.min is rewritten; calling with no arguments is a no-op. */
+    template<typename ...Args>
+    void set_min(Args... args) {
+        static_assert(sizeof...(args) <= D, "Too many arguments for dimensionality of Image");
+        assert(sizeof...(args) <= (size_t)dimensions());
+        // Trailing sentinel keeps the array non-empty when the pack is empty.
+        const int x[] = {args..., 0};
+        for (size_t i = 0; i < sizeof...(args); i++) buf.min[i] = x[i];
+    }
+
     /** Make an image which refers to the same data using a different
      * ordering of the dimensions. */
-    Buffer<D> transposed(int d1, int d2) const {
-        Buffer<D> im = *this;
+    Image<T, D> transposed(int d1, int d2) const {
+        Image<T, D> im = *this;
         im.transpose(d1, d2);
         return im;
     }
@@ -618,11 +896,11 @@ class Buffer {
 
     /** Make a lower-dimensional image that refers to one slice of this
      * image. Drops any device handle. */
-    Buffer<D-1> sliced(int d, int pos) const {
-        Buffer<D> im = *this;
+    Image<T, D-1> sliced(int d, int pos) const {
+        Image<T, D> im = *this;
         im.buf.dev = 0;
         im.slice(d, pos);
-        return Buffer<D-1>(std::move(im));
+        return Image<T, D-1>(std::move(im));
     }
 
     /** Slice an image in-place */
@@ -651,9 +929,9 @@ class Buffer {
      &im(x, y, c) == &im2(x, 17, y, c);
      \endcode
      */
-    Buffer<D+1> embedded(int d, int pos) const {
+    Image<T, D+1> embedded(int d, int pos) const {
         assert(d >= 0 && d <= dimensions());
-        Buffer<D+1> im(*this);
+        Image<T, D+1> im(*this);
         im.buf.dev = 0;
         im.add_dimension();
         im.translate(im.dimensions() - 1, pos);
@@ -696,7 +974,7 @@ class Buffer {
      * for_each_element below for more details. */
     template<typename Fn>
     void for_each_element(Fn f) const {
-        Halide::Tools::for_each_element(buf, f);
+        Halide::for_each_element(buf, f);
     }
 
     /** Methods for managing any GPU allocation. */
@@ -717,157 +995,39 @@ class Buffer {
         buf.dev_dirty = v;
     }
 
-    void copy_to_host() {
+    void copy_to_host(void *ctx = nullptr) {
         if (device_dirty()) {
-            halide_copy_to_host(NULL, &buf);
+            halide_copy_to_host(ctx, &buf);
         }
     }
 
-    void copy_to_device(const struct halide_device_interface *device_interface) {
+    void copy_to_device(const struct halide_device_interface *device_interface, void *ctx = nullptr) {
         if (host_dirty()) {
-            halide_copy_to_device(NULL, &buf, device_interface);
+            halide_copy_to_device(ctx, &buf, device_interface);
         }
     }
 
-    void device_free() {
-        halide_device_free(nullptr, &buf);
-    }
-    // @}
-};
-
-/** A templated Image class that wraps buffer_t and adds
- * functionality. When using Halide from C++, this is the preferred
- * way to create input and output buffers. The overhead of using this
- * class relative to a naked buffer_t is minimal - it uses another
- * ~100 bytes on the stack, and does no dynamic allocations when using
- * it to represent existing memory. This overhead will shrink further
- * in the future once buffer_t is deprecated.
- *
- * The template parameter T is the element type, and D is the maximum
- * number of dimensions. It must be less than or equal to 4 for now.
- *
- * The class optionally allocates and owns memory for the image using
- * a std::shared_ptr allocated with the provided allocator. If they
- * are null, malloc and free are used.  Any device-side allocation is
- * not owned, and must be freed manually using device_free.
- *
- * For accessing the shape and type, this class provides both the
- * buffer_t interface (extent[i], min[i], and stride[i] arrays, the
- * elem_size field), and also the interface of the yet-to-come
- * halide_buffer_t, which will replace buffer_t. This is intended to
- * allow a gradual transition to halide_buffer_t. New code should
- * access the shape via dim[i].extent, dim[i].min, dim[i].stride, and
- * the type via the 'type' field. */
-template<typename T, int D = 4>
-class Image : public Buffer<D> {
-    static_assert(D <= 4, "buffer_t supports a maximum of four dimensions");
-
-public:
-    typedef T ElemType;
-
-    /** Get the type of the elements. Overridden here because we
-     * statically know the type. */
-    halide_type_t type() const {
-        return halide_type_of<std::remove_cv<T>::type>();
-    }
-
-    Image() {}
-
-    Image(const buffer_t &buf) : Buffer<D>(buf) {}
-
-    /** Allocate a new image of the given size. Pass zeroes to make a
-     * buffer suitable for bounds query calls. */
-    template<typename ...Args>
-    Image(int first, Args&&... rest) :
-        Buffer<D>(halide_type_of<typename std::remove_cv<T>::type>(), first, int(rest)...) {}
-
-    /** Make an image that refers to a statically sized array. Does not
-     * take ownership of the data. */
-    template<typename Array, size_t N>
-    explicit Image(Array (&vals)[N]) :
-        Buffer<D>(vals) {}
-
-    /** Initialize an Image from a pointer and some sizes. Assumes
-     * dense row-major packing and a min coordinate of zero. Does not
-     * take ownership of the data. */
-    template<typename ...Args>
-    explicit Image(T *data, int first, Args&&... rest) :
-        Buffer<D>(data, first, int(rest)...) {}
-
-    /** Initialize an Image from a pointer to the min coordinate and
-     * an array describing the shape.  Does not take ownership of the
-     * data. */
-    template<int N, typename std::enable_if<N < D>::type>
-    explicit Image(T *data, halide_dimension_t shape[N]) : Buffer<D>(data, shape) {}
-
-    /** Initialize an Image from a pointer to the min coordinate and
-     * an array describing the shape.  Does not take ownership of the
-     * data. This version exists so that there's a non-templated
-     * version to use in case the Image is a derived type and so N
-     * can't be inferred in the version above. */
-    explicit Image(T *data, halide_dimension_t shape[D]) :
-        Buffer<D>(data, shape) {}
-
-    /** Construct a typed Image from an untyped Buffer. Asserts at
-     * runtime if there's a type mismatch, or if the dimensionality of
-     * the buffer is less than D. */
-    template<int D2>
-    Image(const Buffer<D2> &buf) : Buffer<D>(buf) {
-        assert(halide_type_of<typename std::remove_cv<T>::type>() == buf.type());
-    }
-
-    /** Move-construct a typed Image from an untyped Buffer. Asserts
-     * at runtime if there's a type mismatch, or if the dimensionality
-     * of the buffer is less than D. */
-    template<int D2>
-    Image(const Buffer<D2> &&buf) : Buffer<D>(buf) {
-        assert(halide_type_of<typename std::remove_cv<T>::type>() == buf.type());
-    }
-
-    /** Construct an Image from an Image of a different
-     * dimensionality. Asserts at runtime the other dimensionality is
-     * greater than D. Asserts at compile-time if the element type
-     * doesn't match. This constructor is templated on the element
-     * type of the argument so that the Buffer constructor above is
-     * not used for Images with mismatched types.
-     */
-    template<typename T2, int D2>
-    Image(const Image<T2, D2> &buf) : Buffer<D>(buf) {
-        static_assert(std::is_same<typename std::remove_cv<T>::type, T2>::value,
-                      "Can't construct an Image from an Image of different element type, "
-                      "with the exception of casting an Image<T> to an Image<const T>.");
-    }
-
-    /** Move-construct an Image from an Image of a different
-     * dimensionality. Asserts at runtime the other dimensionality is
-     * greater than D. Asserts at compile-time if the element type
-     * doesn't match.
-     */
-    template<typename T2, int D2>
-    Image(const Image<T2, D2> &&buf) : Buffer<D>(buf) {
-        static_assert(std::is_same<typename std::remove_cv<T>::type, T2>::value,
-                      "Can't construct an Image from an Image of different element type, "
-                      "with the exception of casting an Image<T> to an Image<const T>.");
+    void device_free(void *ctx = nullptr) {
+        halide_device_free(ctx, &buf);
     }
 
-    /** Assign an Image from an Image of a different
-     * dimensionality. Asserts at runtime the other dimensionality is
-     * greater than D.
-     */
-    template<int D2>
-    Image<T, D> &operator=(const Image<T, D2> &other) {
-        Buffer<D>::operator=(other);
-        return *this;
+    void device_sync(void *ctx = nullptr) {
+        halide_device_sync(ctx, &buf);
     }
+    // @}
 
-    /** Move-assign an Image from an Image of a different
-     * dimensionality. Asserts at runtime the other dimensionality is
-     * greater than D.
-     */
-    template<int D2>
-    Image<T, D> &operator=(const Image<T, D2> &&other) {
-        Buffer<D>::operator=(other);
-        return *this;
+    /** If you use the (x, y, c) indexing convention, then Halide
+     * Images are stored planar by default. This function constructs
+     * an interleaved RGB or RGBA image that can still be indexed
+     * using (x, y, c). Passing it to a generator requires that the
+     * generator has been compiled with support for interleaved (also
+     * known as packed or chunky) memory layouts. */
+    static Image<void, D> make_interleaved(halide_type_t t, int width, int height, int channels) {
+        static_assert(D >= 3, "Not enough dimensions to make an interleaved image");
+        Image<void, D> im(t, channels, width, height);
+        im.transpose(0, 1);
+        im.transpose(1, 2);
+        return im;
     }
 
     /** If you use the (x, y, c) indexing convention, then Halide
@@ -884,12 +1044,36 @@ class Image : public Buffer<D> {
         return im;
     }
 
+    /** Wrap an existing interleaved image. */
+    static Image<void, D> make_interleaved(halide_type_t t, T *data, int width, int height, int channels) {
+        static_assert(D >= 3, "Not enough dimensions to make an interleaved image");
+        Image<void, D> im(t, data, channels, width, height);
+        im.transpose(0, 1);
+        im.transpose(1, 2);
+        return im;
+    }
+
+    /** Wrap an existing interleaved image. */
+    static Image<T, D> make_interleaved(T *data, int width, int height, int channels) {
+        static_assert(D >= 3, "Not enough dimensions to make an interleaved image");
+        Image<T, D> im(data, channels, width, height);
+        im.transpose(0, 1);
+        im.transpose(1, 2);
+        return im;
+    }
+
+    /** Make a zero-dimensional Image */
+    static Image<void, D> make_scalar(halide_type_t t) {
+        return Image<void, 1>(t, 1).sliced(0, 0);
+    }
+
     /** Make a zero-dimensional Image */
     static Image<T, D> make_scalar() {
         return Image<T, 1>(1).sliced(0, 0);
     }
 
 private:
+
     template<typename ...Args>
      __attribute__((always_inline))
     T *address_of(int d, int first, Args... rest) const {
@@ -931,57 +1115,72 @@ class Image : public Buffer<D> {
     //@{
     template<typename ...Args>
     __attribute__((always_inline))
-    typename std::enable_if<AllInts<Args...>::value, const T &>::type
+    typename std::enable_if<AllInts<Args...>::value, const not_void_T &>::type
     operator()(int first, Args... rest) const {
-        return *(address_of(0, first, rest...));
+        static_assert(!T_is_void,
+                      "Cannot use operator() on Image<void> types");
+        return *((const not_void_T *)(address_of(0, first, rest...)));
     }
 
     __attribute__((always_inline))
-    const T &operator()() const {
-        return *(address_of(0));
+    const not_void_T &
+    operator()() const {
+        static_assert(!T_is_void,
+                      "Cannot use operator() on Image<void> types");
+        return *((const not_void_T *)(data()));
     }
 
     __attribute__((always_inline))
-    const T &operator()(const int *pos) const {
-        return *((T *)address_of(pos));
+    const not_void_T &
+    operator()(const int *pos) const {
+        static_assert(!T_is_void,
+                      "Cannot use operator() on Image<void> types");
+        return *((const not_void_T *)(address_of(pos)));
     }
 
     template<typename ...Args>
     __attribute__((always_inline))
-    typename std::enable_if<AllInts<Args...>::value, T &>::type
+    typename std::enable_if<AllInts<Args...>::value, not_void_T &>::type
     operator()(int first, Args... rest) {
-        return *(address_of(0, first, rest...));
+        static_assert(!T_is_void,
+                      "Cannot use operator() on Image<void> types");
+        return *((not_void_T *)(address_of(0, first, rest...)));
     }
 
     __attribute__((always_inline))
-    T &operator()() {
-        return *(address_of(0));
+    not_void_T &
+    operator()() {
+        static_assert(!T_is_void,
+                      "Cannot use operator() on Image<void> types");
+        return *((not_void_T *)(data()));
     }
 
     __attribute__((always_inline))
-    T &operator()(const int *pos) {
-        return *((T *)address_of(pos));
+    not_void_T &
+    operator()(const int *pos) {
+        static_assert(!T_is_void,
+                      "Cannot use operator() on Image<void> types");
+        return *((not_void_T *)(address_of(pos)));
     }
     // @}
 
     /** Other calls to operator()(Args...) get redirected to a call to
-     * ImageAccessor<T, D, Args...>::operator(const Image<T, D> &,
-     * Args...).  This makes it possible for later code to add new
-     * Image access methods for types not convertible to int
-     * (e.g. Exprs). To add a custom accessor, define a template
-     * specialization of ImageAccessor with an operator() method that
-     * takes the expected arguments. See
+     * image_accessor(const Image<T, D> &, Args...). This makes it
+     * possible for later code to add new Image access methods for
+     * types not convertible to int (e.g. Exprs). To add a custom
+     * accessor, define an overload of image_accessor that takes the
+     * expected arguments. See
      * test/correctness/custom_image_accessor.cpp for an example. */
     template<typename ...Args>
     auto operator()(Args... args) const ->
-        decltype(ImageAccessor<T, D, Args...>()(*this, args...)) {
-        return ImageAccessor<T, D, Args...>()(*this, args...);
+        decltype(image_accessor(*this, args...)) {
+        return image_accessor(*this, args...);
     }
 
     template<typename ...Args>
     auto operator()(Args... args) ->
-        decltype(ImageAccessor<T, D, Args...>()(*this, args...)) {
-        return ImageAccessor<T, D, Args...>()(*this, args...);
+        decltype(image_accessor(*this, args...)) {
+        return image_accessor(*this, args...);
     }
 
 private:
@@ -989,9 +1188,9 @@ class Image : public Buffer<D> {
     // lambda of the correct dimensionality.
     template<typename ...Args>
     typename std::enable_if<(sizeof...(Args) < D)>::type
-    fill_helper(T val, Args... args) {
-        if (sizeof...(Args) == Buffer<D>::dimensions()) {
-            Buffer<D>::for_each_element([&](Args... args) {(*this)(args...) = val;});
+    fill_helper(not_void_T val, Args... args) {
+        if (sizeof...(Args) == dimensions()) {
+            for_each_element([&](Args... args) {(*this)(args...) = val;});
         } else {
             fill_helper(val, 0, args...);
         }
@@ -999,74 +1198,18 @@ class Image : public Buffer<D> {
 
     template<typename ...Args>
     typename std::enable_if<(sizeof...(Args) == D)>::type
-    fill_helper(T val, Args...) {
-        Buffer<D>::for_each_element([&](Args... args) {(*this)(args...) = val;});
+    fill_helper(not_void_T val, Args...) {
+        for_each_element([&](Args... args) {(*this)(args...) = val;});
     }
 
 public:
 
     /** Set every value in the buffer to the given value */
-    void fill(T val) {
+    template<typename T2 = T, typename = typename std::enable_if<!std::is_void<T2>::value>::type>  // SFINAE: no fill() on Image<void>
+    void fill(not_void_T val) {
         fill_helper(val);
     }
 
-    /** Make a new image which is a deep copy of this image. Use crop
-     * or slice followed by copy to make a copy of only a portion of
-     * the image. The new image uses the same memory layout as the
-     * original, with holes compacted away. */
-    Image<T, D> copy(void *(*allocate_fn)(size_t) = nullptr,
-                     void (*deallocate_fn)(void *) = nullptr) const {
-        return Image<T, D>(Buffer<D>::copy(allocate_fn, deallocate_fn));
-    }
-
-    /** Make an image that refers to a sub-range of this image along
-     * the given dimension. Does not assert the crop region is within
-     * the existing bounds. */
-    Image<T, D> cropped(int d, int min, int extent) const {
-        return Image<T, D>(Buffer<D>::cropped(d, min, extent));
-    }
-
-    /** Make an image that refers to a sub-rectangle of this image along
-     * the first N dimensions. Does not assert the crop region is within
-     * the existing bounds. The cropped image drops any device handle. */
-    Image<T, D> cropped(const std::vector<std::pair<int, int>> &rect) const {
-        return Image<T, D>(Buffer<D>::cropped(rect));
-    }
-
-    /** Make an image which refers to the same data with using
-     * translated coordinates in the given dimension. Positive values
-     * move the image data to the right or down relative to the
-     * coordinate system. */
-    Image<T, D> translated(int d, int dx) const {
-        return Image<T, D>(Buffer<D>::translated(d, dx));
-    }
-
-    /** Make an image which refers to the same data with using
-     * translated coordinates along the first N dimensions. Positive
-     * values move the image data to the right or down relative to the
-     * coordinate system. */
-    Image<T, D> translated(const std::vector<int> &delta) const {
-        return Image<T, D>(Buffer<D>::translated(delta));
-    }
-
-    /** Make an image which refers to the same data using a different
-     * ordering of the dimensions. */
-    Image<T, D> transposed(int d1, int d2) const {
-        return Image<T, D>(Buffer<D>::transposed(d1, d2));
-    }
-
-    /** Make a lower-dimensional image that refers to one slice of this
-     * image. */
-    Image<T, D-1> sliced(int d, int pos) const {
-        return Image<T, D-1>(Buffer<D>::sliced(d, pos));
-    }
-
-    /** Make a higher-dimensional image in which this image is one
-     * slice. The opposite of sliced. */
-    Image<T, D+1> embedded(int d, int pos) const {
-        return Image<T, D+1>(Buffer<D>::embedded(d, pos));
-    }
-
 };
 
 /** Some helpers for for_each_element. */
@@ -1263,7 +1406,8 @@ void for_each_element(const buffer_t &buf, Fn &&f) {
     for_each_element_helpers<Fn>::for_each_element(0, buf, std::forward<Fn>(f));
 }
 
-}  // namespace Tools
+
+
 }  // namespace Halide
 
 #endif  // HALIDE_RUNTIME_IMAGE_H
diff --git a/test/correctness/constant_type.cpp b/test/correctness/constant_type.cpp
index fa3e868a3de4..411812189507 100644
--- a/test/correctness/constant_type.cpp
+++ b/test/correctness/constant_type.cpp
@@ -16,13 +16,13 @@ bool test_type() {
         return false;
     }
 
-    Expr add_one = im + 1;
+    Expr add_one = im(_) + 1;
     if (add_one.type() != t) {
         std::cout << "Add 1 changed type from " << t << " to " << add_one.type() << "\n";
         return false;
     }
 
-    Expr one_add = 1 + im;
+    Expr one_add = 1 + im(_);
     if (one_add.type() != t) {
         std::cout << "Pre-add 1 changed type from " << t << " to " << one_add.type() << "\n";
         return false;
diff --git a/test/correctness/custom_image_accessor.cpp b/test/correctness/custom_image_accessor.cpp
index 9cd911dd7624..a7dc745d33f9 100644
--- a/test/correctness/custom_image_accessor.cpp
+++ b/test/correctness/custom_image_accessor.cpp
@@ -1,7 +1,7 @@
-#include "halide_image.h"
+#include "Halide.h"
 #include <cmath>
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 // First a very simple example. We'll make it possible to access an
 // image with a custom 3D coordinate type.
@@ -9,8 +9,10 @@ struct Coord {
     int x, y, z;
 };
 
+
 namespace Halide {
-namespace Tools {
+
+/*
 template<typename T, int D>
 struct ImageAccessor<T, D, Coord> {
     // We want to use this accessor to assign to locations too, so
@@ -22,8 +24,18 @@ struct ImageAccessor<T, D, Coord> {
         return im(c.x, c.y, c.z);
     }
 };
+*/
 
+template<typename T, int D>
+T image_accessor(const Image<T, D> &im, Coord c) {
+    return im(c.x, c.y, c.z);
 }
+
+template<typename T, int D>
+T &image_accessor(Image<T, D> &im, Coord c) {
+    return im(c.x, c.y, c.z);
+}
+
 }
 
 // Next we'll use a more complex variadic example. We'll extend
@@ -71,11 +83,11 @@ struct AllFloatConvertible<T, Args...> {
 };
 
 namespace Halide {
-namespace Tools {
 
 // Then we define a partial specialization of
 // Halide::Tools::ImageAccessor that catches any access where the all
 // args are float-convertible.
+/*
 template<int D, typename ...Args>
 struct ImageAccessor<typename std::enable_if<AllFloatConvertible<Args...>::value, float>::type, D, Args...>  {
     float operator()(const Image<float, D> &im, Args... args) {
@@ -83,8 +95,15 @@ struct ImageAccessor<typename std::enable_if<AllFloatConvertible<Args...>::value
         return MultiLinearSampler<D, sizeof...(args)>()(im, coords);
     }
 };
+*/
 
+template<int D, typename ...Args>
+typename std::enable_if<AllFloatConvertible<Args...>::value, float>::type
+image_accessor(const Image<float, D> &im, Args... args) {
+    float coords[] = {float(args)...};
+    return MultiLinearSampler<D, sizeof...(args)>()(im, coords);
 }
+
 }
 
 int main(int argc, char **argv) {
diff --git a/test/correctness/div_mod.cpp b/test/correctness/div_mod.cpp
index 3da609d6177b..4194ef30f744 100644
--- a/test/correctness/div_mod.cpp
+++ b/test/correctness/div_mod.cpp
@@ -312,7 +312,7 @@ bool div_mod(int vector_width, ScheduleVariant scheduling, const Target &target)
             f.compute_root().hexagon();
             break;
     };
-     
+
     Realization R = f.realize(WIDTH, HEIGHT, target);
     Image<T> q(R[0]);
     Image<T> r(R[1]);
@@ -395,7 +395,7 @@ bool f_mod() {
 
     // Compute modulus result and check it.
     Func f;
-    f(_) = a % b;  // Using Halide mod operation.
+    f(_) = a(_) % b(_);  // Using Halide mod operation.
     f.realize(out);
 
     // Explicit checks of the simplifier for consistency with runtime computation
diff --git a/test/correctness/for_each_element.cpp b/test/correctness/for_each_element.cpp
index a034a167a95f..e622b2e711d1 100644
--- a/test/correctness/for_each_element.cpp
+++ b/test/correctness/for_each_element.cpp
@@ -1,6 +1,6 @@
-#include "halide_image.h"
+#include "Halide.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
     // Try several different ways of accessing a the pixels of an image,
diff --git a/test/correctness/gpu_non_contiguous_copy.cpp b/test/correctness/gpu_non_contiguous_copy.cpp
index e81d85825320..50d96a8e1950 100644
--- a/test/correctness/gpu_non_contiguous_copy.cpp
+++ b/test/correctness/gpu_non_contiguous_copy.cpp
@@ -26,7 +26,7 @@ int main(int argc, char **argv) {
     cropped.stride[1] *= 2;
     cropped.stride[2] *= 2;
     cropped.stride[3] *= 2;
-    Buffer out(Int(32), &cropped);
+    Image<int32_t> out(cropped);
 
     // Make a bitmask representing the region inside the crop.
     Image<bool> in_subregion(80, 60, 10, 10);
diff --git a/test/correctness/input_larger_than_two_gigs.cpp b/test/correctness/input_larger_than_two_gigs.cpp
index a90c01ec6fcd..737f6443599a 100644
--- a/test/correctness/input_larger_than_two_gigs.cpp
+++ b/test/correctness/input_larger_than_two_gigs.cpp
@@ -25,7 +25,7 @@ int main(int argc, char **argv) {
     buf.stride[2] = 0;
     buf.elem_size = 1;
 
-    Buffer param_buf(UInt(8), &buf);
+    Image<uint8_t> param_buf(buf);
     ImageParam input(UInt(8), 3);
     input.set(param_buf);
 
diff --git a/test/correctness/interleave.cpp b/test/correctness/interleave.cpp
index 2c8b87082333..435ac294ec12 100644
--- a/test/correctness/interleave.cpp
+++ b/test/correctness/interleave.cpp
@@ -145,23 +145,19 @@ int main(int argc, char **argv) {
             .set_stride(1, 1)
             .set_extent(1, 3);
 
-        Buffer buff3;
-        buff3 = Buffer(Float(32), 16, 3, 0, 0, (uint8_t *)0);
-        buff3.raw_buffer()->stride[0] = 3;
-        buff3.raw_buffer()->stride[1] = 1;
+        Image<float> buff3(3, 16);
+        buff3.transpose(0, 1);
 
-        Realization r3({buff3});
-        interleaved.realize(r3);
+        interleaved.realize(buff3);
 
         check_interleave_count(interleaved, 1);
 
-        Image<float> result3 = r3[0];
         for (int x = 0; x < 16; x++) {
             for (int y = 0; y < 3; y++) {
                 float correct = 3*x + y;
-                float delta = result3(x,y) - correct;
+                float delta = buff3(x, y) - correct;
                 if (delta > 0.01 || delta < -0.01) {
-                    printf("result(%d) = %f instead of %f\n", x, result3(x,y), correct);
+                    printf("result(%d) = %f instead of %f\n", x, buff3(x,y), correct);
                     return -1;
                 }
             }
@@ -197,21 +193,17 @@ int main(int argc, char **argv) {
 
         check_interleave_count(output4, 1);
 
-        Buffer buff4;
-        buff4 = Buffer(Float(32), 16, 4, 0, 0, (uint8_t *)0);
-        buff4.raw_buffer()->stride[0] = 4;
-        buff4.raw_buffer()->stride[1] = 1;
+        Image<float> buff4(4, 16);
+        buff4.transpose(0, 1);
 
-        Realization r4({buff4});
-        output4.realize(r4);
+        output4.realize(buff4);
 
-        Image<float> result4 = r4[0];
         for (int x = 0; x < 16; x++) {
             for (int y = 0; y < 4; y++) {
                 float correct = sin((y+1)*x);
-                float delta = result4(x,y) - correct;
+                float delta = buff4(x, y) - correct;
                 if (delta > 0.01 || delta < -0.01) {
-                    printf("result(%d) = %f instead of %f\n", x, result4(x,y), correct);
+                    printf("result(%d) = %f instead of %f\n", x, buff4(x,y), correct);
                     return -1;
                 }
             }
@@ -240,21 +232,17 @@ int main(int argc, char **argv) {
 
         check_interleave_count(output5, 1);
 
-        Buffer buff5;
-        buff5 = Buffer(Float(32), 16, 5, 0, 0, (uint8_t *)0);
-        buff5.raw_buffer()->stride[0] = 5;
-        buff5.raw_buffer()->stride[1] = 1;
+        Image<float> buff5(5, 16);
+        buff5.transpose(0, 1);
 
-        Realization r5({buff5});
-        output5.realize(r5);
+        output5.realize(buff5);
 
-        Image<float> result5 = r5[0];
         for (int x = 0; x < 16; x++) {
             for (int y = 0; y < 5; y++) {
                 float correct = sin((y+1)*x);
-                float delta = result5(x,y) - correct;
+                float delta = buff5(x, y) - correct;
                 if (delta > 0.01 || delta < -0.01) {
-                    printf("result(%d) = %f instead of %f\n", x, result5(x,y), correct);
+                    printf("result(%d) = %f instead of %f\n", x, buff5(x,y), correct);
                     return -1;
                 }
             }
@@ -382,8 +370,8 @@ int main(int argc, char **argv) {
             .set_stride(0,1).set_stride(1,8)
             .set_extent(0,8).set_extent(1,8);
 
-        Image<uint16_t> result6(8,8);
-        Image<uint16_t> result7(8,8);
+        Image<uint16_t> result6(8, 8);
+        Image<uint16_t> result7(8, 8);
         trans1.realize(result6);
         trans2.realize(result7);
 
diff --git a/test/correctness/interleave_rgb.cpp b/test/correctness/interleave_rgb.cpp
index f3cca9176c2b..9b66a71f2ddd 100644
--- a/test/correctness/interleave_rgb.cpp
+++ b/test/correctness/interleave_rgb.cpp
@@ -25,20 +25,14 @@ bool test_interleave() {
     } else {
         interleaved.vectorize(x, target.natural_vector_size<uint8_t>()).unroll(c);
     }
-    Buffer buff(type_of<T>(), 256, 128, 3);
-    buff.raw_buffer()->stride[0] = 3;
-    buff.raw_buffer()->stride[1] = 3 * buff.extent(0);
-    buff.raw_buffer()->stride[2] = 1;
-
-    Realization r({buff});
-    interleaved.realize(r, target);
-    Image<T> out = r[0];
-    for (int y = 0; y < out.height(); y++) {
-        for (int x = 0; x < out.width(); x++) {
+    Image<T> buff = Image<T>::make_interleaved(256, 128, 3);
+    interleaved.realize(buff, target);
+    for (int y = 0; y < buff.height(); y++) {
+        for (int x = 0; x < buff.width(); x++) {
             for (int c = 0; c < 3; c++) {
                 T correct = x * 3 + y * 5 + c;
-                if (out(x, y, c) != correct) {
-                    printf("out(%d, %d, %d) = %d instead of %d\n", x, y, c, out(x, y, c), correct);
+                if (buff(x, y, c) != correct) {
+                    printf("out(%d, %d, %d) = %d instead of %d\n", x, y, c, buff(x, y, c), correct);
                     return false;
                 }
             }
diff --git a/test/correctness/math.cpp b/test/correctness/math.cpp
index 46dd39f950d4..a617bd145a2f 100644
--- a/test/correctness/math.cpp
+++ b/test/correctness/math.cpp
@@ -63,7 +63,7 @@ uint32_t absd(uint32_t a, uint32_t b) { return a < b ? b - a : a - b; }
         Var x("x");                                                           \
         ImageParam input(type_of<type>(), 1);                                 \
         test_##name(x) = name(input(x));                                      \
-        Buffer in_buffer(type_of<type>(), in_buf);                            \
+        Image<type> in_buffer(*in_buf);                                       \
         input.set(in_buffer);                                                 \
         if (target.has_gpu_feature()) {                                       \
             test_##name.gpu_tile(x, 8);                                       \
@@ -89,7 +89,7 @@ uint32_t absd(uint32_t a, uint32_t b) { return a < b ? b - a : a - b; }
         Var x("x");                                                                 \
         ImageParam input(type_of<type>(), 2);                                       \
         test_##name(x) = name(input(0, x), input(1, x));                            \
-        Buffer in_buffer(type_of<type>(), in_buf);                                  \
+        Image<type> in_buffer(*in_buf);                                             \
         input.set(in_buffer);                                                       \
         if (target.has_gpu_feature()) {                                             \
           test_##name.gpu_tile(x, 8);                                               \
diff --git a/test/correctness/min_extent.cpp b/test/correctness/min_extent.cpp
index 182aebea4d38..5548d90631a7 100644
--- a/test/correctness/min_extent.cpp
+++ b/test/correctness/min_extent.cpp
@@ -21,12 +21,8 @@ int main(int argc, char **argv) {
     // but we initialize them anyway.
     Image<int> input(5);
     Image<int> out(10);
-    for (int i = 0; i < input.width(); i++) {
-        input(i) = 0;
-    }
-    for (int i = 0; i < out.width(); i++) {
-        out(i) = 0;
-    }
+    input.fill(0);
+    out.fill(0);
 
     // Change coordinate origin of input and output buffer so that they are
     // aligned as follows:
@@ -34,14 +30,14 @@ int main(int argc, char **argv) {
     // out     |-----------------|
     const int INOFF = 4;
     const int OUTOFF = 1;
-    in.set(input);
     input.set_min(INOFF);
     out.set_min(OUTOFF);
+    in.set(input);
     f.realize(out);
 
     // Check correctness of result
     int expected[] = { -10, -20, -30, 4, 5, 6, 7, 8, 90, 100 };
-    for (int i=0; i<out.width(); i++) {
+    for (int i = 0; i < out.width(); i++) {
         if (out(i + OUTOFF) != expected[i]) {
             printf("Unexpected output: %d != %d\n", out(i + OUTOFF), expected[i]);
             return -1;
diff --git a/test/correctness/multipass_constraints.cpp b/test/correctness/multipass_constraints.cpp
index 921a0a5a4291..c5745262e579 100644
--- a/test/correctness/multipass_constraints.cpp
+++ b/test/correctness/multipass_constraints.cpp
@@ -20,15 +20,9 @@ int main(int argc, char **argv) {
 
     o.set_bounds(0, 0, select(o.extent(0) < 22, o.extent(0) + 1, o.extent(0)));
 
-    // The only way to build a query buffer for output bounds right
-    // now is to make a buffer_t manually.
-    buffer_t out_buffer_t = {0};
-    out_buffer_t.min[0] = 2;
-    out_buffer_t.extent[0] = 7;
-    out_buffer_t.min[1] = 2;
-    out_buffer_t.extent[1] = 8;
-
-    Buffer out_buf(Float(32), &out_buffer_t, "out_buf");
+    // Make a bounds query buffer
+    Image<float> out_buf(nullptr, 7, 8);
+    out_buf.set_min(2, 2);
 
     out.infer_input_bounds(out_buf);
 
diff --git a/test/correctness/output_larger_than_two_gigs.cpp b/test/correctness/output_larger_than_two_gigs.cpp
index da85bc5ae763..fe97381c698a 100644
--- a/test/correctness/output_larger_than_two_gigs.cpp
+++ b/test/correctness/output_larger_than_two_gigs.cpp
@@ -31,7 +31,7 @@ int main(int argc, char **argv) {
 
     identity_uint8.set_error_handler(&halide_error);
 
-    Buffer output_buf(UInt(8), &buf);
+    Image<uint8_t> output_buf(buf);
     Target t = get_jit_target_from_environment();
 
     if (t.bits != 32) {
diff --git a/test/correctness/process_some_tiles.cpp b/test/correctness/process_some_tiles.cpp
index e3d3a1b24067..b13cea3f8715 100644
--- a/test/correctness/process_some_tiles.cpp
+++ b/test/correctness/process_some_tiles.cpp
@@ -68,7 +68,8 @@ int main(int argc, char **argv) {
     output.compile_jit();
 
     Image<bool> bitmap_buf(10, 10);
-    bitmap_buf(5, 5) = 1;
+    bitmap_buf.fill(false);
+    bitmap_buf(5, 5) = true;
     bitmap.set(bitmap_buf);
 
     Image<float> image_buf = lambda(x, y, (sin(x+y)+1)/2).realize(10 * tile_size, 10 * tile_size);
diff --git a/test/correctness/realize_over_shifted_domain.cpp b/test/correctness/realize_over_shifted_domain.cpp
index dc10035e4501..9174bd479ba5 100644
--- a/test/correctness/realize_over_shifted_domain.cpp
+++ b/test/correctness/realize_over_shifted_domain.cpp
@@ -4,7 +4,7 @@
 using namespace Halide;
 
 int main(int argc, char **argv) {
-    Image<int> input(100, 50, "input");
+    Image<int> input(100, 50);
 
     // This image represents the range [100, 199]*[50, 99]
     input.set_min(100, 50);
diff --git a/test/correctness/rfactor.cpp b/test/correctness/rfactor.cpp
index e588f669aeab..9e77d38a94f5 100644
--- a/test/correctness/rfactor.cpp
+++ b/test/correctness/rfactor.cpp
@@ -455,13 +455,15 @@ int histogram_rfactor_test(bool compile_module) {
             reference_hist[uint8_t(in(x, y))] += 1;
         }
     }
+    // Wrap the image in a buffer, so that we know its name.
+    Buffer in_buf(in);
 
     Func hist("hist"), g("g");
     Var x("x");
 
     RDom r(in);
     hist(x) = 0;
-    hist(clamp(cast<int>(in(r.x, r.y)), 0, 255)) += 1;
+    hist(clamp(cast<int>(in_buf(r.x, r.y)), 0, 255)) += 1;
     hist.compute_root();
 
     Var u("u");
@@ -482,7 +484,7 @@ int histogram_rfactor_test(bool compile_module) {
             {hist.name(), {}},
             {hist.update(0).name(), {intm.name(), hist.name()}},
             {intm.name(), {}},
-            {intm.update(0).name(), {in.name(), intm.name()}},
+            {intm.update(0).name(), {in_buf.name(), intm.name()}},
 
         };
         if (check_call_graphs(checker.calls, expected) != 0) {
diff --git a/test/correctness/shifted_image.cpp b/test/correctness/shifted_image.cpp
index 66fbdd1eca7a..db00650e77c9 100644
--- a/test/correctness/shifted_image.cpp
+++ b/test/correctness/shifted_image.cpp
@@ -20,7 +20,7 @@ int main(int argc, char **argv) {
     buf.stride[3] = 1000;
     buf.elem_size = 4;
 
-    Image<int> im(&buf);
+    Image<int> im(buf);
 
     ((int *)buf.host)[0] = 17;
     buf.host[0] = 17;
diff --git a/test/correctness/simd_op_check.cpp b/test/correctness/simd_op_check.cpp
index 1ad77e71786d..7b018f9dede7 100644
--- a/test/correctness/simd_op_check.cpp
+++ b/test/correctness/simd_op_check.cpp
@@ -166,8 +166,8 @@ void check(string op, int vector_width, Expr e) {
 
     bool can_run_the_code = can_run_code();
     if (can_run_the_code) {
-        Realization r = error.realize(0, target.without_feature(Target::NoRuntime));
-        double e = Image<double>(r[0])(0);
+        Realization r = error.realize(target.without_feature(Target::NoRuntime));
+        double e = Image<double>(r[0])();
         // Use a very loose tolerance for floating point tests. The
         // kinds of bugs we're looking for are codegen bugs that
         // return the wrong value entirely, not floating point
diff --git a/test/error/bad_host_alignment.cpp b/test/error/bad_host_alignment.cpp
index 332d61c7fce5..29e7740fedb5 100644
--- a/test/error/bad_host_alignment.cpp
+++ b/test/error/bad_host_alignment.cpp
@@ -8,21 +8,11 @@ IRPrinter irp(std::cerr);
 int main(int argc, char **argv) {
     Func f;
     Var x, y;
-    int arr[11][10];
-    uint8_t *ptr  = reinterpret_cast<uint8_t*>(arr);
-    ptr += 1;
-    buffer_t buf;
-    buf.host = ptr;
-    buf.extent[0] = 10;
-    buf.extent[1] = 10;
-    buf.stride[0] = 1;
-    buf.stride[1] = 10;
-    buf.elem_size = 1;
-    buf.min[0] = 0;
-    buf.min[1] = 0;
-    Buffer param_buf(UInt(8), &buf);
     ImageParam in(UInt(8), 2);
 
+    Image<uint8_t, 2> param_buf(11, 10);
+    param_buf.crop(0, 1, 10);
+
     in.set_host_alignment(512);
     f(x, y) = in(x, y);
     f.compute_root();
diff --git a/test/error/buffer_larger_than_two_gigs.cpp b/test/error/buffer_larger_than_two_gigs.cpp
index c84386bb9291..d6d97fa415a5 100644
--- a/test/error/buffer_larger_than_two_gigs.cpp
+++ b/test/error/buffer_larger_than_two_gigs.cpp
@@ -4,9 +4,9 @@
 using namespace Halide;
 int main(int argc, char **argv) {
     if (sizeof(void *) == 8) {
-        Buffer result(UInt(8), 1 << 24, 1 << 24, 1 << 24);
+        Image<uint8_t> result(1 << 24, 1 << 24, 1 << 24);
     } else {
-        Buffer result(UInt(8), 1 << 12, 1 << 12, 1 << 8);
+        Image<uint8_t> result(1 << 12, 1 << 12, 1 << 8);
     }
     printf("Success!\n");
 }
diff --git a/test/generator/acquire_release_aottest.cpp b/test/generator/acquire_release_aottest.cpp
index b191a1b1a91d..e3f36efa8f39 100644
--- a/test/generator/acquire_release_aottest.cpp
+++ b/test/generator/acquire_release_aottest.cpp
@@ -10,14 +10,14 @@ int main(int argc, char **argv) {
 
 #include <math.h>
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include <assert.h>
 #include <string.h>
 
 #include "acquire_release.h"
 
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 const int W = 256, H = 256;
 
diff --git a/test/generator/argvcall_aottest.cpp b/test/generator/argvcall_aottest.cpp
index 4ed40f22311d..03892e89f469 100644
--- a/test/generator/argvcall_aottest.cpp
+++ b/test/generator/argvcall_aottest.cpp
@@ -1,12 +1,12 @@
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
 #include <math.h>
 #include <stdio.h>
 
 #include "argvcall.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 const int kSize = 32;
 
diff --git a/test/generator/cleanup_on_error_aottest.cpp b/test/generator/cleanup_on_error_aottest.cpp
index ddba4fa2c54d..88a29addabf4 100644
--- a/test/generator/cleanup_on_error_aottest.cpp
+++ b/test/generator/cleanup_on_error_aottest.cpp
@@ -1,5 +1,5 @@
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
 // Grab the internal device_interface functions
 #define WEAK
@@ -10,13 +10,13 @@
 
 #include "cleanup_on_error.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 const int size = 64;
 
 int successful_mallocs = 0, failed_mallocs = 0, frees = 0, errors = 0, device_mallocs = 0, device_frees = 0;
 
- void *my_halide_malloc(void *user_context, size_t x) {
+void *my_halide_malloc(void *user_context, size_t x) {
     // Only the first malloc succeeds
     if (successful_mallocs) {
         failed_mallocs++;
diff --git a/test/generator/cxx_mangling_aottest.cpp b/test/generator/cxx_mangling_aottest.cpp
index 8f61819668c6..12a1320dcd46 100644
--- a/test/generator/cxx_mangling_aottest.cpp
+++ b/test/generator/cxx_mangling_aottest.cpp
@@ -1,13 +1,13 @@
 #include <stdio.h>
 
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include <assert.h>
 #include <string.h>
 
 #include "cxx_mangling.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int32_t extract_value_global(int32_t *arg) {
     return *arg;
diff --git a/test/generator/cxx_mangling_define_extern_aottest.cpp b/test/generator/cxx_mangling_define_extern_aottest.cpp
index 823b81e0dcb4..265fa93f8cdb 100644
--- a/test/generator/cxx_mangling_define_extern_aottest.cpp
+++ b/test/generator/cxx_mangling_define_extern_aottest.cpp
@@ -1,14 +1,14 @@
 #include <stdio.h>
 
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
 #include <assert.h>
 #include <string.h>
 
 #include "cxx_mangling_define_extern.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int32_t extract_value_global(int32_t *arg) {
     return *arg;
diff --git a/test/generator/embed_image_aottest.cpp b/test/generator/embed_image_aottest.cpp
index 31d29185396d..6274ea764a2d 100644
--- a/test/generator/embed_image_aottest.cpp
+++ b/test/generator/embed_image_aottest.cpp
@@ -2,9 +2,9 @@
 #include <stdio.h>
 
 #include "embed_image.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
     Image<float> input(10, 10, 3);
diff --git a/test/generator/example_aottest.cpp b/test/generator/example_aottest.cpp
index 3412279fa164..e16ac188852e 100644
--- a/test/generator/example_aottest.cpp
+++ b/test/generator/example_aottest.cpp
@@ -1,12 +1,12 @@
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
 #include <math.h>
 #include <stdio.h>
 
 #include "example.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 const int kSize = 32;
 
diff --git a/test/generator/extended_buffer_t_aottest.cpp b/test/generator/extended_buffer_t_aottest.cpp
index b9f5df47c63c..15df4172c6a9 100644
--- a/test/generator/extended_buffer_t_aottest.cpp
+++ b/test/generator/extended_buffer_t_aottest.cpp
@@ -3,9 +3,9 @@
 
 #include "extended_buffer_t_common.h"
 #include "extended_buffer_t.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
     Image<float> input(10, 10);
diff --git a/test/generator/gpu_object_lifetime_aottest.cpp b/test/generator/gpu_object_lifetime_aottest.cpp
index 52f92d34b0e6..0a8a4809fad0 100644
--- a/test/generator/gpu_object_lifetime_aottest.cpp
+++ b/test/generator/gpu_object_lifetime_aottest.cpp
@@ -1,7 +1,7 @@
 #include <math.h>
 #include <stdio.h>
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include <assert.h>
 
 #if COMPILING_FOR_CUDA
@@ -14,7 +14,7 @@
 
 #include "../common/gpu_object_lifetime.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 void my_halide_print(void *user_context, const char *str) {
     printf("%s", str);
diff --git a/test/generator/gpu_only_aottest.cpp b/test/generator/gpu_only_aottest.cpp
index d65dc45a1a55..7ae8c8ec447e 100644
--- a/test/generator/gpu_only_aottest.cpp
+++ b/test/generator/gpu_only_aottest.cpp
@@ -1,7 +1,7 @@
 #include <math.h>
 #include <stdio.h>
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include <assert.h>
 #if defined(TEST_OPENCL)
 #include "HalideRuntimeOpenCL.h"
@@ -10,7 +10,7 @@
 #endif
 
 #include "gpu_only.h"
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
 #if defined(TEST_OPENCL) || defined(TEST_CUDA)
diff --git a/test/generator/image_from_array_aottest.cpp b/test/generator/image_from_array_aottest.cpp
index d8cae5a9d0b9..b21d8697630b 100644
--- a/test/generator/image_from_array_aottest.cpp
+++ b/test/generator/image_from_array_aottest.cpp
@@ -1,4 +1,4 @@
-#include "halide_image.h"
+#include "HalideImage.h"
 
 #include <stdint.h>
 #include <iostream>
@@ -7,7 +7,7 @@
 #include <vector>
 
 using namespace std;
-using namespace Halide::Tools;
+using namespace Halide;
 
 //-----------------------------------------------------------------------------
 // Returns the dimension sizes of a statically sized array from inner to outer.
diff --git a/test/generator/mandelbrot_aottest.cpp b/test/generator/mandelbrot_aottest.cpp
index 77dbad24941f..b2ede4d1f0c0 100644
--- a/test/generator/mandelbrot_aottest.cpp
+++ b/test/generator/mandelbrot_aottest.cpp
@@ -3,9 +3,9 @@
 #include <stdio.h>
 
 #include "mandelbrot.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
     Image<int> output(100, 30);
diff --git a/test/generator/memory_profiler_mandelbrot_aottest.cpp b/test/generator/memory_profiler_mandelbrot_aottest.cpp
index 3299267d7e26..c6f52a2c8160 100644
--- a/test/generator/memory_profiler_mandelbrot_aottest.cpp
+++ b/test/generator/memory_profiler_mandelbrot_aottest.cpp
@@ -6,10 +6,10 @@
 #include <stdio.h>
 
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "memory_profiler_mandelbrot.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 using std::map;
 using std::string;
 
diff --git a/test/generator/metadata_tester_aottest.cpp b/test/generator/metadata_tester_aottest.cpp
index 8fbb43a47989..aa770ec73e91 100644
--- a/test/generator/metadata_tester_aottest.cpp
+++ b/test/generator/metadata_tester_aottest.cpp
@@ -1,5 +1,5 @@
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
 #include <math.h>
 #include <stdio.h>
@@ -10,7 +10,7 @@
 #include "metadata_tester.h"
 #include "metadata_tester_ucon.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 const int kSize = 32;
 
diff --git a/test/generator/multitarget_aottest.cpp b/test/generator/multitarget_aottest.cpp
index 4303b6bdb422..9bb01b1b0250 100644
--- a/test/generator/multitarget_aottest.cpp
+++ b/test/generator/multitarget_aottest.cpp
@@ -3,9 +3,9 @@
 #include <tuple>
 #include "HalideRuntime.h"
 #include "multitarget.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 void my_error_handler(void *user_context, const char *message) {
     printf("Saw Error: (%s)\n", message);
diff --git a/test/generator/pyramid_aottest.cpp b/test/generator/pyramid_aottest.cpp
index e10a5cae80e8..0afef4fcd1bc 100644
--- a/test/generator/pyramid_aottest.cpp
+++ b/test/generator/pyramid_aottest.cpp
@@ -2,11 +2,11 @@
 #include <stdio.h>
 
 #include "pyramid.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
 #include <vector>
 using std::vector;
-using namespace Halide::Tools;
+using namespace Halide;
 
 int main(int argc, char **argv) {
     Image<float> input(1024, 1024);
diff --git a/test/generator/tiled_blur_aottest.cpp b/test/generator/tiled_blur_aottest.cpp
index a7625460e6fd..7c3b1beb9130 100644
--- a/test/generator/tiled_blur_aottest.cpp
+++ b/test/generator/tiled_blur_aottest.cpp
@@ -1,12 +1,12 @@
 #include <math.h>
 #include <stdio.h>
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include <assert.h>
 
 #include "tiled_blur.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 const int W = 80, H = 80;
 
diff --git a/test/generator/tiled_blur_interleaved_aottest.cpp b/test/generator/tiled_blur_interleaved_aottest.cpp
index 284cb64f4a53..648d27035617 100644
--- a/test/generator/tiled_blur_interleaved_aottest.cpp
+++ b/test/generator/tiled_blur_interleaved_aottest.cpp
@@ -1,12 +1,12 @@
 #include <math.h>
 #include <stdio.h>
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include <assert.h>
 
 #include "tiled_blur_interleaved.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 const int W = 80, H = 80;
 
diff --git a/test/generator/user_context_aottest.cpp b/test/generator/user_context_aottest.cpp
index 23e11e3510af..5ec05a6aa7ad 100644
--- a/test/generator/user_context_aottest.cpp
+++ b/test/generator/user_context_aottest.cpp
@@ -3,10 +3,10 @@
 #include <assert.h>
 
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "user_context.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 static void *context_pointer = (void *)0xf00dd00d;
 
diff --git a/test/generator/user_context_insanity_aottest.cpp b/test/generator/user_context_insanity_aottest.cpp
index a28c198dab9d..651bf30bd81a 100644
--- a/test/generator/user_context_insanity_aottest.cpp
+++ b/test/generator/user_context_insanity_aottest.cpp
@@ -3,10 +3,10 @@
 #include <assert.h>
 
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 #include "user_context_insanity.h"
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 const int num_launcher_tasks = 1000;
 
diff --git a/test/generator/variable_num_threads_aottest.cpp b/test/generator/variable_num_threads_aottest.cpp
index 8ab5b21e56fb..b39cbd272a2f 100644
--- a/test/generator/variable_num_threads_aottest.cpp
+++ b/test/generator/variable_num_threads_aottest.cpp
@@ -1,5 +1,5 @@
 #include "HalideRuntime.h"
-#include "halide_image.h"
+#include "HalideImage.h"
 
 #include <stdio.h>
 #include <stdlib.h>
@@ -10,7 +10,7 @@
 bool stop = false;
 int max_threads = 1;
 
-using namespace Halide::Tools;
+using namespace Halide;
 
 void mess_with_num_threads(void *) {
     while (!stop) {
diff --git a/test/opengl/lut.cpp b/test/opengl/lut.cpp
index e4534ef6d65d..590cd8454791 100644
--- a/test/opengl/lut.cpp
+++ b/test/opengl/lut.cpp
@@ -13,9 +13,9 @@ int test_lut1d() {
     Var y("y");
     Var c("c");
 
-    Image<uint8_t> input(8, 8, 3, "input");
-    for (int y=0; y<input.height(); y++) {
-        for (int x=0; x<input.width(); x++) {
+    Image<uint8_t> input(8, 8, 3);
+    for (int y = 0; y < input.height(); y++) {
+        for (int x = 0; x < input.width(); x++) {
             float v = (1.0f/16.0f) + (float)x/8.0f;
             input(x, y, 0) = (uint8_t)(v * 255.0f);
             input(x, y, 1) = (uint8_t)((1.0f - v)*255.0f);
@@ -24,7 +24,7 @@ int test_lut1d() {
     }
 
     // 1D Look Up Table case
-    Image<float> lut1d(8, 1, 3, "lut1d");
+    Image<float> lut1d(8, 1, 3);
     for (int c = 0; c != 3; ++c) {
         for (int i = 0; i != 8; ++i) {
             lut1d(i, 0, c) = (float)(1 + i);
@@ -36,7 +36,7 @@ int test_lut1d() {
 
     f0(x, y, c) = lut1d(clamp(e, 0, 7), 0, c);
 
-    Image<float> out0(8, 8, 3,"out");
+    Image<float> out0(8, 8, 3);
 
     f0.bound(c, 0, 3);
     f0.glsl(x, y, c);
diff --git a/test/opengl/produce.cpp b/test/opengl/produce.cpp
index 2384496562e5..2de9155eaf40 100644
--- a/test/opengl/produce.cpp
+++ b/test/opengl/produce.cpp
@@ -14,7 +14,7 @@ int test_lut1d() {
     Var y("y");
     Var c("c");
 
-    Image<uint8_t> input(8, 8, 3, "input");
+    Image<uint8_t> input(8, 8, 3);
     for (int y = 0; y < input.height(); y++) {
         for (int x = 0; x < input.width(); x++) {
             float v = (1.0f / 16.0f) + (float)x / 8.0f;
@@ -37,7 +37,7 @@ int test_lut1d() {
     f0.bound(c, 0, 3);
     f0.glsl(x, y, c);
 
-    Image<float> out0(8, 8, 3, "out");
+    Image<float> out0(8, 8, 3);
     f0.realize(out0);
 
     out0.copy_to_host();
diff --git a/test/opengl/rewrap_texture.cpp b/test/opengl/rewrap_texture.cpp
index 870eccc4c468..d37b43acea24 100644
--- a/test/opengl/rewrap_texture.cpp
+++ b/test/opengl/rewrap_texture.cpp
@@ -37,9 +37,9 @@ int main() {
     const int height = 10;
 
     Image<uint8_t> input(width, height, 3);
-    Buffer out1(UInt(8), width, height, 3);
-    Buffer out2(UInt(8), width, height, 3);
-    Buffer out3(UInt(8), width, height, 3);
+    Image<uint8_t> out1(width, height, 3);
+    Image<uint8_t> out2(width, height, 3);
+    Image<uint8_t> out3(width, height, 3);
 
     Var x, y, c;
     Func g;
diff --git a/test/opengl/save_state.cpp b/test/opengl/save_state.cpp
index 39bf45626d09..8493297fc940 100644
--- a/test/opengl/save_state.cpp
+++ b/test/opengl/save_state.cpp
@@ -277,7 +277,7 @@ int main() {
     KnownState known_state;
 
     Image<uint8_t> input(255, 10, 3);
-    Buffer out(UInt(8), 255, 10, 3);
+    Image<uint8_t> out(255, 10, 3);
 
     Var x, y, c;
     Func g;
diff --git a/test/performance/boundary_conditions.cpp b/test/performance/boundary_conditions.cpp
index 4a152f72181b..e0e394752447 100644
--- a/test/performance/boundary_conditions.cpp
+++ b/test/performance/boundary_conditions.cpp
@@ -28,11 +28,10 @@ struct Test {
 
         Image<float> out = g.realize(W, H);
 
-        Buffer buf(out);
         // best of 10 x 5 runs.
         time = benchmark(10, 5, [&]() {
-                g.realize(buf);
-                buf.device_sync();
+                g.realize(out);
+                out.device_sync();
         });
 
         printf("%-20s: %f us\n", name, time * 1e6);
@@ -55,10 +54,9 @@ struct Test {
         Image<float> out = g.realize(W, H);
 
         // best of 3 x 3 runs.
-        Buffer buf(out);
         time = benchmark(3, 3, [&]() {
-                g.realize(buf);
-                buf.device_sync();
+                g.realize(out);
+                out.device_sync();
         });
 
         printf("%-20s: %f us\n", name, time * 1e6);
diff --git a/test/performance/packed_planar_fusion.cpp b/test/performance/packed_planar_fusion.cpp
index 1dc9fb0e128c..c16f45f38f16 100644
--- a/test/performance/packed_planar_fusion.cpp
+++ b/test/performance/packed_planar_fusion.cpp
@@ -39,29 +39,11 @@ double test_copy(Image<uint8_t> src, Image<uint8_t> dst) {
 }
 
 Image<uint8_t> make_packed(uint8_t *host, int W, int H) {
-    buffer_t buf = {0};
-    buf.host = host;
-    buf.extent[0] = W;
-    buf.stride[0] = 3;
-    buf.extent[1] = H;
-    buf.stride[1] = buf.stride[0] * buf.extent[0];
-    buf.extent[2] = 3;
-    buf.stride[2] = 1;
-    buf.elem_size = 1;
-    return Image<uint8_t>(&buf);
+    return Image<uint8_t>::make_interleaved(host, W, H, 3);
 }
 
 Image<uint8_t> make_planar(uint8_t *host, int W, int H) {
-    buffer_t buf = {0};
-    buf.host = host;
-    buf.extent[0] = W;
-    buf.stride[0] = 1;
-    buf.extent[1] = H;
-    buf.stride[1] = buf.stride[0] * buf.extent[0];
-    buf.extent[2] = 3;
-    buf.stride[2] = buf.stride[1] * buf.extent[1];
-    buf.elem_size = 1;
-    return Image<uint8_t>(&buf);
+    return Image<uint8_t>(host, W, H, 3);
 }
 
 int main(int argc, char **argv) {
diff --git a/test/performance/rgb_interleaved.cpp b/test/performance/rgb_interleaved.cpp
index 2236b946c6c8..6cd614a5fc62 100644
--- a/test/performance/rgb_interleaved.cpp
+++ b/test/performance/rgb_interleaved.cpp
@@ -12,63 +12,32 @@ void test_deinterleave() {
 
     dst(x, y, c) = src(x, y, c);
 
-    src.set_stride(0, 3);
-    src.set_stride(2, 1);
-    src.set_extent(2, 3);
+    src.dim(0).set_stride(3)
+        .dim(2).set_stride(1).set_bounds(0, 3);
 
     // This is the default format for Halide, but made explicit for illustration.
-    dst.output_buffer().set_stride(0, 1);
-    dst.output_buffer().set_extent(2, 3);
+    dst.output_buffer()
+        .dim(0).set_stride(1)
+        .dim(2).set_extent(3);
 
     dst.reorder(c, x, y).unroll(c);
     dst.vectorize(x, 16);
 
-    // Run test many times to avoid timing jitter
-    const int iterations = 20;
 
     // Allocate two 16 megapixel, 3 channel, 8-bit images -- input and output
-    const int32_t buffer_side_length = (1 << 12);
-    const int32_t buffer_size = buffer_side_length * buffer_side_length;
-
-    uint8_t *src_storage(new uint8_t[buffer_size * 3]);
-    uint8_t *dst_storage(new uint8_t[buffer_size * 3]);
-
-    buffer_t src_buffer;
-    buffer_t dst_buffer;
 
     // Setup src to be RGB interleaved, with no extra padding between channels or rows.
-    memset(&src_buffer, 0, sizeof(src_buffer));
-    src_buffer.host = src_storage;
-    src_buffer.extent[0] = buffer_side_length;
-    src_buffer.stride[0] = 3;
-    src_buffer.extent[1] = buffer_side_length;
-    src_buffer.stride[1] = src_buffer.stride[0] * src_buffer.extent[0];
-    src_buffer.extent[2] = 3;
-    src_buffer.stride[2] = 1;
-    src_buffer.elem_size = 1;
+    Image<uint8_t> src_image = Image<uint8_t>::make_interleaved(1 << 12, 1 << 12, 3);
 
     // Setup dst to be planar, with no extra padding between channels or rows.
-    memset(&dst_buffer, 0, sizeof(dst_buffer));
-    dst_buffer.host = dst_storage;
-    dst_buffer.extent[0] = buffer_side_length;
-    dst_buffer.stride[0] = 1;
-    dst_buffer.extent[1] = buffer_side_length;
-    dst_buffer.stride[1] = dst_buffer.stride[0] * dst_buffer.extent[0];
-    dst_buffer.extent[2] = 3;
-    dst_buffer.stride[2] = dst_buffer.stride[1] * dst_buffer.extent[1];
-    dst_buffer.elem_size = 1;
-
-    Image<uint8_t> src_image(&src_buffer, "src_image");
-    Image<uint8_t> dst_image(&dst_buffer, "dst_image");
-
-    for (int32_t x = 0; x < buffer_side_length; x++) {
-        for (int32_t y = 0; y < buffer_side_length; y++) {
-          src_image(x, y, 0) = 0;
-          src_image(x, y, 1) = 128;
-          src_image(x, y, 2) = 255;
-        }
-    }
-    memset(dst_storage, 0, buffer_size);
+    Image<uint8_t> dst_image(1 << 12, 1 << 12, 3);
+
+    src_image.for_each_element([&](int x, int y) {
+            src_image(x, y, 0) = 0;
+            src_image(x, y, 1) = 128;
+            src_image(x, y, 2) = 255;
+        });
+    dst_image.fill(0);
 
     src.set(src_image);
 
@@ -77,49 +46,36 @@ void test_deinterleave() {
     // Warm up caches, etc.
     dst.realize(dst_image);
 
-    double t1 = benchmark(1, iterations, [&]() {
+    double t1 = benchmark(1, 20, [&]() {
         dst.realize(dst_image);
     });
 
-    printf("Interleaved to planar bandwidth %.3e byte/s.\n", buffer_size / t1);
+    printf("Interleaved to planar bandwidth %.3e byte/s.\n",
+           dst_image.number_of_elements() / t1);
 
-    for (int32_t x = 0; x < buffer_side_length; x++) {
-        for (int32_t y = 0; y < buffer_side_length; y++) {
+    dst_image.for_each_element([&](int x, int y) {
             assert(dst_image(x, y, 0) == 0);
             assert(dst_image(x, y, 1) == 128);
             assert(dst_image(x, y, 2) == 255);
-        }
-    }
+        });
 
     // Setup a semi-planar output case.
-    memset(&dst_buffer, 0, sizeof(dst_buffer));
-    dst_buffer.host = dst_storage;
-    dst_buffer.extent[0] = buffer_side_length;
-    dst_buffer.stride[0] = 1;
-    dst_buffer.extent[1] = buffer_side_length;
-    dst_buffer.stride[1] = dst_buffer.stride[0] * dst_buffer.extent[0] * 3;
-    dst_buffer.extent[2] = 3;
-    dst_buffer.stride[2] = dst_buffer.extent[0];
-    dst_buffer.elem_size = 1;
-
-    memset(dst_storage, 0, buffer_size);
-
-    double t2 = benchmark(1, iterations, [&]() {
+    dst_image = Image<uint8_t>(1 << 12, 3, 1 << 12);
+    dst_image.transpose(1, 2);
+    dst_image.fill(0);
+
+    double t2 = benchmark(1, 20, [&]() {
         dst.realize(dst_image);
     });
 
-    for (int32_t x = 0; x < buffer_side_length; x++) {
-        for (int32_t y = 0; y < buffer_side_length; y++) {
+    dst_image.for_each_element([&](int x, int y) {
             assert(dst_image(x, y, 0) == 0);
             assert(dst_image(x, y, 1) == 128);
             assert(dst_image(x, y, 2) == 255);
-        }
-    }
+        });
 
-    printf("Interleaved to semi-planar bandwidth %.3e byte/s.\n", buffer_size / t2);
-
-    delete[] src_storage;
-    delete[] dst_storage;
+    printf("Interleaved to semi-planar bandwidth %.3e byte/s.\n",
+           dst_image.number_of_elements() / t2);
 }
 
 void test_interleave(bool fast) {
@@ -130,13 +86,11 @@ void test_interleave(bool fast) {
     dst(x, y, c) = src(x, y, c);
 
     // This is the default format for Halide, but made explicit for illustration.
-    src.set_stride(0, 1);
-    src.set_extent(2, 3);
+    src.dim(0).set_stride(1).dim(2).set_extent(3);
 
-    dst.output_buffer().set_min(2, 0);
-    dst.output_buffer().set_stride(0, 3);
-    dst.output_buffer().set_stride(2, 1);
-    dst.output_buffer().set_extent(2, 3);
+    dst.output_buffer()
+        .dim(0).set_stride(3)
+        .dim(2).set_stride(1).set_bounds(0, 3);
 
     if( fast ) {
         dst.reorder(c, x, y).bound(c, 0, 3).unroll(c);
@@ -145,81 +99,44 @@ void test_interleave(bool fast) {
         dst.reorder(c, x, y).vectorize(x, 16);
     }
 
-    // Run test many times to avoid timing jitter
-    const int iterations = 20;
-
     // Allocate two 16 megapixel, 3 channel, 8-bit images -- input and output
-    const int32_t buffer_side_length = (1 << 12);
-    const int32_t buffer_size = buffer_side_length * buffer_side_length;
 
-    uint8_t *src_storage(new uint8_t[buffer_size * 3]);
-    uint8_t *dst_storage(new uint8_t[buffer_size * 3]);
+    // Setup src to be planar
+    Image<uint8_t> src_image(1 << 12, 1 << 12, 3);
 
-    buffer_t src_buffer;
-    buffer_t dst_buffer;
+    // Setup dst to be interleaved
+    Image<uint8_t> dst_image = Image<uint8_t>::make_interleaved(1 << 12, 1 << 12, 3);
 
-    // Setup src to be RGB interleaved, with no extra padding between channels or rows.
-    memset(&src_buffer, 0, sizeof(src_buffer));
-    src_buffer.host = src_storage;
-    src_buffer.extent[0] = buffer_side_length;
-    src_buffer.stride[0] = 1;
-    src_buffer.extent[1] = buffer_side_length;
-    src_buffer.stride[1] = src_buffer.stride[0] * src_buffer.extent[0];
-    src_buffer.extent[2] = 3;
-    src_buffer.stride[2] = src_buffer.stride[1] * src_buffer.extent[1];
-    src_buffer.elem_size = 1;
-
-    // Setup dst to be planar, with no extra padding between channels or rows.
-    memset(&dst_buffer, 0, sizeof(dst_buffer));
-    dst_buffer.host = dst_storage;
-    dst_buffer.extent[0] = buffer_side_length;
-    dst_buffer.stride[0] = 3;
-    dst_buffer.extent[1] = buffer_side_length;
-    dst_buffer.stride[1] = dst_buffer.stride[0] * dst_buffer.extent[0];
-    dst_buffer.extent[2] = 3;
-    dst_buffer.stride[2] = 1;
-    dst_buffer.elem_size = 1;
-
-    Image<uint8_t> src_image(&src_buffer, "src_image");
-    Image<uint8_t> dst_image(&dst_buffer, "dst_image");
-
-    for (int32_t x = 0; x < buffer_side_length; x++) {
-        for (int32_t y = 0; y < buffer_side_length; y++) {
-          src_image(x, y, 0) = 0;
-          src_image(x, y, 1) = 128;
-          src_image(x, y, 2) = 255;
-        }
-    }
-    memset(dst_storage, 0, buffer_size);
+    src_image.for_each_element([&](int x, int y) {
+            src_image(x, y, 0) = 0;
+            src_image(x, y, 1) = 128;
+            src_image(x, y, 2) = 255;
+        });
+    dst_image.fill(0);
 
     src.set(src_image);
 
-    if( fast ) {
+    if (fast) {
         dst.compile_to_lowered_stmt("rgb_interleave_fast.stmt", dst.infer_arguments());
     } else {
         dst.compile_to_lowered_stmt("rgb_interleave_slow.stmt", dst.infer_arguments());
     }
-    dst.compile_jit();
 
     // Warm up caches, etc.
     dst.realize(dst_image);
 
-    double t = benchmark(1, iterations, [&]() {
+    double t = benchmark(1, 20, [&]() {
         dst.realize(dst_image);
     });
 
-    printf("Planar to interleaved bandwidth %.3e byte/s.\n", buffer_size / t);
+    printf("Planar to interleaved bandwidth %.3e byte/s.\n",
+           dst_image.number_of_elements() / t);
 
-    for (int32_t x = 0; x < buffer_side_length; x++) {
-        for (int32_t y = 0; y < buffer_side_length; y++) {
+    dst_image.for_each_element([&](int x, int y) {
             assert(dst_image(x, y, 0) == 0);
             assert(dst_image(x, y, 1) == 128);
             assert(dst_image(x, y, 2) == 255);
-        }
-    }
-
-    delete[] src_storage;
-    delete[] dst_storage;
+        });
 }
 
 int main(int argc, char **argv) {
diff --git a/test/performance/wrap.cpp b/test/performance/wrap.cpp
index 639b577889be..e3ea91e51ffa 100644
--- a/test/performance/wrap.cpp
+++ b/test/performance/wrap.cpp
@@ -125,23 +125,20 @@ int main(int argc, char **argv) {
     Image<int> out1(1000, 1000);
     Image<int> out2(1000, 1000);
     Image<int> out3(1000, 1000);
-    Buffer buf1(out1);
-    Buffer buf2(out2);
-    Buffer buf3(out3);
 
     double shared_time = benchmark(5, 5, [&]() {
-            use_shared.realize(buf1);
-            buf1.device_sync();
+            use_shared.realize(out1);
+            out1.device_sync();
         });
 
     double l1_time = benchmark(5, 5, [&]() {
-            use_l1.realize(buf2);
-            buf2.device_sync();
+            use_l1.realize(out2);
+            out2.device_sync();
         });
 
     double wrap_time = benchmark(5, 5, [&]() {
-            use_wrap_for_shared.realize(buf3);
-            buf3.device_sync();
+            use_wrap_for_shared.realize(out3);
+            out3.device_sync();
         });
 
     // Check correctness of the wrapper version
diff --git a/test/renderscript/aot_copy.cpp b/test/renderscript/aot_copy.cpp
index e01c4bdcff71..6827e1343e62 100644
--- a/test/renderscript/aot_copy.cpp
+++ b/test/renderscript/aot_copy.cpp
@@ -3,50 +3,34 @@
 using namespace Halide;
 using namespace Halide::Internal;
 
-Image<uint8_t> make_interleaved_image(uint8_t *host, int W, int H, int channels) {
-    buffer_t buf = {0};
-    buf.host = host;
-    buf.extent[0] = W;
-    buf.stride[0] = channels;
-    buf.extent[1] = H;
-    buf.stride[1] = buf.stride[0] * buf.extent[0];
-    buf.extent[2] = channels;
-    buf.stride[2] = 1;
-    buf.elem_size = 1;
-    return Image<uint8_t>(&buf);
-}
-
 void copy_interleaved(bool vectorize, int channels) {
     ImageParam input8(UInt(8), 3, "input");
-    input8.set_stride(0, channels)
-        .set_stride(1, Halide::Expr())
-        .set_stride(2, 1)
-        .set_bounds(2, 0, channels);  // expecting interleaved image
-    uint8_t *in_buf = new uint8_t[128 * 128 * channels];
-    uint8_t *out_buf = new uint8_t[128 * 128 * channels];
-    Image<uint8_t> in = make_interleaved_image(in_buf, 128, 128, channels);
-    Image<uint8_t> out = make_interleaved_image(out_buf, 128, 128, channels);
+    input8
+        .dim(0).set_stride(channels)
+        .dim(2).set_stride(1).set_bounds(0, channels);
+
+    Image<uint8_t> in = Image<uint8_t>::make_interleaved(128, 128, channels);
+    Image<uint8_t> out = Image<uint8_t>::make_interleaved(128, 128, channels);
     input8.set(in);
 
     Var x, y, c;
     Func result("result");
     result(x, y, c) = input8(x, y, c);
+
     result.output_buffer()
-        .set_stride(0, channels)
-        .set_stride(1, Halide::Expr())
-        .set_stride(2, 1)
-        .set_bounds(2, 0, channels);  // expecting interleaved image
+        .dim(0).set_stride(channels)
+        .dim(2).set_stride(1).set_bounds(0, channels);
 
     result.bound(c, 0, channels);
     result.shader(x, y, c, DeviceAPI::Renderscript);
-    if (vectorize) result.vectorize(c);
+    if (vectorize) {
+        result.vectorize(c);
+    }
 
     std::vector<Argument> args;
     args.push_back(input8);
 
     result.compile_to_file("aot_copy", args);
-    delete[] in_buf;
-    delete[] out_buf;
 }
 
 int main(int argc, char **argv) {
diff --git a/test/renderscript/aot_copy_error.cpp b/test/renderscript/aot_copy_error.cpp
index 102a0abd92aa..d1321af0e9ce 100644
--- a/test/renderscript/aot_copy_error.cpp
+++ b/test/renderscript/aot_copy_error.cpp
@@ -3,58 +3,37 @@
 using namespace Halide;
 using namespace Halide::Internal;
 
-Image<uint8_t> make_interleaved_image(uint8_t *host, int W, int H,
-                                      int nChannels) {
-    buffer_t buf = { 0 };
-    buf.host = host;
-    buf.extent[0] = W;
-    buf.stride[0] = nChannels;
-    buf.extent[1] = H;
-    buf.stride[1] = buf.stride[0] * buf.extent[0];
-    buf.extent[2] = nChannels;
-    buf.stride[2] = 1;
-    buf.elem_size = 1;
-    return Image<uint8_t>(&buf);
-}
-
 void copy_interleaved(bool vectorize, int channels) {
     ImageParam input8(UInt(8), 3, "input");
-    input8.set_stride(0, channels)
-        .set_stride(1, Halide::Expr())
-        .set_stride(2, 1)
-        .set_bounds(2, 0, channels);  // expecting interleaved image
-    uint8_t *in_buf = new uint8_t[128 * 128 * channels];
-    uint8_t *out_buf = new uint8_t[128 * 128 * channels];
-    Image<uint8_t> in = make_interleaved_image(in_buf, 128, 128, channels);
-    Image<uint8_t> out = make_interleaved_image(out_buf, 128, 128, channels);
+    input8
+        .dim(0).set_stride(channels)
+        .dim(2).set_stride(1).set_bounds(0, channels);  // expecting interleaved image
+    Image<uint8_t> in = Image<uint8_t>::make_interleaved(128, 128, channels);
+    Image<uint8_t> out = Image<uint8_t>::make_interleaved(128, 128, channels);
     input8.set(in);
 
     Var x, y, c;
     Func result("result");
     result(x, y, c) = input8(x, y, c);
     result.output_buffer()
-        .set_stride(0, channels)
-        .set_stride(1, Halide::Expr())
-        .set_stride(2, 1)
-        .set_bounds(2, 0, channels);  // expecting interleaved image
+        .dim(0).set_stride(channels)
+        .dim(2).set_stride(1).set_bounds(0, channels);
 
     result.bound(c, 0, channels);
     result.shader(x, y, c, DeviceAPI::Renderscript);
-    if (vectorize) result.vectorize(c);
+    if (vectorize) {
+        result.vectorize(c);
+    }
 
     std::vector<Argument> args;
     args.push_back(input8);
 
     result.compile_to_file("aot_copy_error", args);
-    delete[] in_buf;
-    delete[] out_buf;
 }
 
 int main(int argc, char **argv) {
-    const bool VECTORIZE = true;
-
-    copy_interleaved(VECTORIZE, 3);
+    copy_interleaved(true, 3);
 
-    std::cout << "Done!" << std::endl;
+    printf("Success!\n");
     return 0;
 }
diff --git a/test/renderscript/jit_copy.cpp b/test/renderscript/jit_copy.cpp
index 57e8398c99a8..659cb4f5c4ca 100644
--- a/test/renderscript/jit_copy.cpp
+++ b/test/renderscript/jit_copy.cpp
@@ -180,27 +180,13 @@ class ValidateInterleavedVectorizedPipeline: public ValidateInterleavedPipeline
     ValidateInterleavedVectorizedPipeline(int _channels) : ValidateInterleavedPipeline(_channels) {}
 };
 
-Image<uint8_t> make_interleaved_image(uint8_t *host, int W, int H, int channels) {
-    buffer_t buf = { 0 };
-    buf.host = host;
-    buf.extent[0] = W;
-    buf.stride[0] = channels;
-    buf.extent[1] = H;
-    buf.stride[1] = buf.stride[0] * buf.extent[0];
-    buf.extent[2] = channels;
-    buf.stride[2] = 1;
-    buf.elem_size = 1;
-    return Image<uint8_t>(&buf);
-}
-
 void copy_interleaved(bool vectorized = false, int channels = 4) {
     ImageParam input8(UInt(8), 3, "input");
     input8.set_stride(0, channels)
         .set_stride(1, Halide::Expr())
         .set_stride(2, 1)
         .set_bounds(2, 0, channels);  // expecting interleaved image
-    uint8_t *in_buf = new uint8_t[128 * 128 * channels];
-    Image<uint8_t> in = make_interleaved_image(in_buf, 128, 128, channels);
+    Image<uint8_t> in = Image<uint8_t>::make_interleaved(128, 128, channels);
     input8.set(in);
 
     Var x, y, c;
@@ -214,16 +200,16 @@ void copy_interleaved(bool vectorized = false, int channels = 4) {
 
     result.bound(c, 0, channels);
     result.shader(x, y, c, DeviceAPI::Renderscript);
-    if (vectorized) result.vectorize(c);
+    if (vectorized) {
+        result.vectorize(c);
+    }
 
     result.add_custom_lowering_pass(
-        vectorized?
+        vectorized ?
             new ValidateInterleavedVectorizedPipeline(channels):
             new ValidateInterleavedPipeline(channels));
 
     result.compile_jit();
-
-    delete[] in_buf;
 }
 
 int main(int argc, char **argv) {
diff --git a/tutorial/lesson_12_using_the_gpu.cpp b/tutorial/lesson_12_using_the_gpu.cpp
index d9fc1209db21..e26aaca07a51 100644
--- a/tutorial/lesson_12_using_the_gpu.cpp
+++ b/tutorial/lesson_12_using_the_gpu.cpp
@@ -162,7 +162,7 @@ class MyPipeline {
         // Use the GPU threads for the x and y coordinates of the
         // padded input.
         padded.gpu_threads(x, y);
-        
+
         // JIT-compile the pipeline for the GPU. CUDA, OpenCL, or
         // Metal are not enabled by default. We have to construct a
         // Target object, enable one of them, and then pass that
@@ -201,14 +201,7 @@ class MyPipeline {
     void test_performance() {
         // Test the performance of the scheduled MyPipeline.
 
-        // If we realize curved into a Halide::Image, that will
-        // unfairly penalize GPU performance by including a GPU->CPU
-        // copy in every run. Halide::Image objects always exist on
-        // the CPU.
-
-        // Halide::Buffer, however, represents a buffer that may
-        // exist on either CPU or GPU or both.
-        Buffer output(UInt(8), input.width(), input.height(), input.channels());
+        Image<uint8_t> output(input.width(), input.height(), input.channels());
 
         // Run the filter once to initialize any GPU runtime state.
         curved.realize(output);
@@ -242,6 +235,13 @@ class MyPipeline {
         Image<uint8_t> output =
             curved.realize(input.width(), input.height(), input.channels());
 
+        // Halide by default does not copy the data back from the GPU
+        // (you might want to keep it there if you're going to feed it
+        // into another GPU pipeline). We can request that it be
+        // copied back like so, before reading it on the host in the
+        // check below:
+        output.copy_to_host();
+
         // Check against the reference output.
         for (int c = 0; c < input.channels(); c++) {
             for (int y = 0; y < input.height(); y++) {